content-core 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of content-core might be problematic.
- content_core/cc_config.yaml +4 -0
- content_core/content/extraction/graph.py +23 -7
- content_core/processors/docling.py +1 -1
- content_core/processors/url.py +11 -56
- content_core/processors/youtube.py +24 -7
- {content_core-0.8.1.dist-info → content_core-0.8.3.dist-info}/METADATA +1 -1
- {content_core-0.8.1.dist-info → content_core-0.8.3.dist-info}/RECORD +10 -10
- {content_core-0.8.1.dist-info → content_core-0.8.3.dist-info}/WHEEL +0 -0
- {content_core-0.8.1.dist-info → content_core-0.8.3.dist-info}/entry_points.txt +0 -0
- {content_core-0.8.1.dist-info → content_core-0.8.3.dist-info}/licenses/LICENSE +0 -0
content_core/content/extraction/graph.py
CHANGED

@@ -2,7 +2,6 @@ import os
 import tempfile
 from typing import Any, Dict, Optional
 from urllib.parse import urlparse
-from content_core.common.types import warn_if_deprecated_engine
 
 import aiohttp
 import magic
@@ -13,11 +12,14 @@ from content_core.common import (
     ProcessSourceState,
     UnsupportedTypeException,
 )
+from content_core.common.types import warn_if_deprecated_engine
 from content_core.config import CONFIG  # type: ignore
 from content_core.logging import logger
 from content_core.processors.audio import extract_audio_data  # type: ignore
-from content_core.processors.docling import
-
+from content_core.processors.docling import (
+    DOCLING_SUPPORTED,  # type: ignore
+    extract_with_docling,
+)
 from content_core.processors.office import (
     SUPPORTED_OFFICE_TYPES,
     extract_office_content,
@@ -60,6 +62,7 @@ async def file_type(state: ProcessSourceState) -> Dict[str, Any]:
 async def file_type_edge(data: ProcessSourceState) -> str:
     assert data.identified_type, "Type not identified"
     identified_type = data.identified_type
+    logger.debug(f"Identified type: {identified_type}")
 
     if identified_type == "text/plain":
         return "extract_txt"
@@ -91,16 +94,19 @@ async def delete_file(data: ProcessSourceState) -> Dict[str, Any]:
 
 
 async def url_type_router(x: ProcessSourceState) -> Optional[str]:
+    assert x.identified_type, "Type not identified"
     return x.identified_type
 
 
 async def source_type_router(x: ProcessSourceState) -> Optional[str]:
+    assert x.source_type, "Source type not identified"
     return x.source_type
 
 
 async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
     url = state.url
     assert url, "No URL provided"
+    logger.debug(f"Downloading remote file: {url}")
     async with aiohttp.ClientSession() as session:
         async with session.get(url) as resp:
             resp.raise_for_status()
@@ -115,7 +121,6 @@ async def download_remote_file(state: ProcessSourceState) -> Dict[str, Any]:
             return {"file_path": tmp, "identified_type": mime}
 
 
-
 async def file_type_router_docling(state: ProcessSourceState) -> str:
     """
     Route to Docling if enabled and supported; otherwise use simple file type edge.
@@ -125,18 +130,25 @@ async def file_type_router_docling(state: ProcessSourceState) -> str:
     engine = state.engine or CONFIG.get("extraction", {}).get("engine", "auto")
     warn_if_deprecated_engine(engine)
     if engine == "auto":
+        logger.debug("Using auto engine")
         # Try docling first; if it fails or is not supported, fallback to simple
        if state.identified_type in DOCLING_SUPPORTED:
             try:
+                logger.debug("Trying docling extraction")
                 return "extract_docling"
             except Exception as e:
-                logger.warning(
+                logger.warning(
+                    f"Docling extraction failed in 'auto' mode, falling back to simple: {e}"
+                )
         # Fallback to simple
+        logger.debug("Falling back to simple extraction")
         return await file_type_edge(state)
 
     if engine == "docling" and state.identified_type in DOCLING_SUPPORTED:
+        logger.debug("Using docling engine")
        return "extract_docling"
     # For 'simple' and 'legacy', use the default file type edge
+    logger.debug("Using simple engine")
     return await file_type_edge(state)
 
 
@@ -179,7 +191,12 @@ workflow.add_conditional_edges(
     "url_provider",
     url_type_router,
     {
-        **{
+        **{
+            m: "download_remote_file"
+            for m in list(SUPPORTED_FITZ_TYPES)
+            + list(SUPPORTED_OFFICE_TYPES)
+            + list(DOCLING_SUPPORTED)
+        },
         "article": "extract_url",
         "youtube": "extract_youtube_transcript",
     },
@@ -197,5 +214,4 @@ workflow.add_edge("extract_audio_data", "delete_file")
 workflow.add_edge("delete_file", END)
 workflow.add_edge("download_remote_file", "file_type")
 
-# Compile graph
 graph = workflow.compile()
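The conditional-edge change above builds the router's mapping by unpacking a dict comprehension, so every MIME type known to the PDF (fitz), Office, and Docling processors routes to the download node before extraction. A minimal, standalone sketch of the idiom, using small placeholder sets in place of the real imported ones:

```python
# Placeholder stand-ins for the sets imported from the processor modules.
SUPPORTED_FITZ_TYPES = {"application/pdf"}
SUPPORTED_OFFICE_TYPES = {"application/msword"}
DOCLING_SUPPORTED = {"text/html", "text/csv"}

routes = {
    # Every downloadable document type is fetched to a temp file first.
    **{
        m: "download_remote_file"
        for m in list(SUPPORTED_FITZ_TYPES)
        + list(SUPPORTED_OFFICE_TYPES)
        + list(DOCLING_SUPPORTED)
    },
    "article": "extract_url",
    "youtube": "extract_youtube_transcript",
}

print(routes["application/pdf"])  # download_remote_file
```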
content_core/processors/docling.py
CHANGED

@@ -26,7 +26,7 @@ DOCLING_SUPPORTED = {
     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
     "application/vnd.openxmlformats-officedocument.presentationml.presentation",
     "text/markdown",
-    "text/plain",
+    # "text/plain", #docling currently not supporting txt
     "text/x-markdown",
     "text/csv",
     "text/html",
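Because "text/plain" is now commented out of DOCLING_SUPPORTED, plain text no longer qualifies for the Docling route even under the auto engine; it falls through to the simple file-type edge, which (per the graph.py hunk above) maps text/plain to extract_txt. A toy sketch of that gate, with a trimmed stand-in set:

```python
# Trimmed stand-in for DOCLING_SUPPORTED after this change.
DOCLING_SUPPORTED = {"text/markdown", "text/x-markdown", "text/csv", "text/html"}

def route(identified_type: str) -> str:
    # Docling only handles types still present in the set.
    if identified_type in DOCLING_SUPPORTED:
        return "extract_docling"
    # Simple edge: plain text goes to the dedicated txt extractor.
    return "extract_txt" if identified_type == "text/plain" else "simple_edge"

print(route("text/plain"))  # extract_txt, no longer extract_docling
```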
content_core/processors/url.py
CHANGED

@@ -1,68 +1,16 @@
 import os
-from io import BytesIO
-from urllib.parse import urlparse
 
 import aiohttp
-import docx
 from bs4 import BeautifulSoup
 from readability import Document
 
 from content_core.common import ProcessSourceState
 from content_core.common.types import warn_if_deprecated_engine
 from content_core.logging import logger
+from content_core.processors.docling import DOCLING_SUPPORTED
+from content_core.processors.office import SUPPORTED_OFFICE_TYPES
 from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
 
-DOCX_MIME_TYPE = (
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-)
-
-
-async def _extract_docx_content(docx_bytes: bytes, url: str):
-    """
-    Extract content from DOCX file bytes.
-    """
-    try:
-        logger.debug(f"Attempting to parse DOCX from URL: {url} with python-docx")
-        doc = docx.Document(BytesIO(docx_bytes))
-        content_parts = [p.text for p in doc.paragraphs if p.text]
-        full_content = "\n\n".join(content_parts)
-
-        # Try to get a title from document properties or first heading
-        title = doc.core_properties.title
-        if not title and doc.paragraphs:
-            # Look for a potential title in the first few paragraphs (e.g., if styled as heading)
-            for p in doc.paragraphs[:5]:  # Check first 5 paragraphs
-                if p.style.name.startswith("Heading"):
-                    title = p.text
-                    break
-            if not title:  # Fallback to first line if no heading found
-                title = (
-                    doc.paragraphs[0].text.strip()
-                    if doc.paragraphs[0].text.strip()
-                    else None
-                )
-
-        # If no title found, use filename from URL
-        if not title:
-            title = urlparse(url).path.split("/")[-1]
-
-        logger.info(f"Successfully extracted content from DOCX: {url}, Title: {title}")
-        return {
-            "title": title,
-            "content": full_content,
-            "domain": urlparse(url).netloc,
-            "url": url,
-        }
-    except Exception as e:
-        logger.error(f"Failed to process DOCX content from {url}: {e}")
-        # Fallback or re-raise, depending on desired error handling
-        return {
-            "title": f"Error Processing DOCX: {urlparse(url).path.split('/')[-1]}",
-            "content": f"Failed to extract content from DOCX: {e}",
-            "domain": urlparse(url).netloc,
-            "url": url,
-        }
-
 
 async def url_provider(state: ProcessSourceState):
     """
@@ -81,12 +29,19 @@ async def url_provider(state: ProcessSourceState):
                 url, timeout=10, allow_redirects=True
             ) as resp:
                 mime = resp.headers.get("content-type", "").split(";", 1)[0]
+                logger.debug(f"MIME type for {url}: {mime}")
     except Exception as e:
-        logger.
+        logger.warning(f"HEAD check failed for {url}: {e}")
         mime = "article"
-    if
+    if (
+        mime in DOCLING_SUPPORTED
+        or mime in SUPPORTED_FITZ_TYPES
+        or mime in SUPPORTED_OFFICE_TYPES
+    ):
+        logger.warning(f"Identified type for {url}: {mime}")
         return_dict["identified_type"] = mime
     else:
+        logger.warning(f"Identified type for {url}: article")
         return_dict["identified_type"] = "article"
     return return_dict
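The rewritten url_provider classifies a URL by probing it with a HEAD request and comparing the returned Content-Type against the three supported-type sets, treating any failure as a readable article. A minimal sketch of that probe, assuming a standalone aiohttp session outside the extraction graph:

```python
import asyncio

import aiohttp


async def sniff_mime(url: str) -> str:
    """Return the bare MIME type of a URL, or 'article' if the probe fails."""
    try:
        async with aiohttp.ClientSession() as session:
            async with session.head(url, timeout=10, allow_redirects=True) as resp:
                # Drop any "; charset=..." suffix from the header value.
                return resp.headers.get("content-type", "").split(";", 1)[0]
    except Exception:
        # Mirrors the diff's fallback: unreachable or odd URLs become articles.
        return "article"


print(asyncio.run(sniff_mime("https://example.com")))
```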
content_core/processors/youtube.py
CHANGED

@@ -8,6 +8,7 @@ from youtube_transcript_api.formatters import TextFormatter  # type: ignore
 
 from content_core.common import ProcessSourceState
 from content_core.common.exceptions import NoTranscriptFound
+from content_core.config import CONFIG
 from content_core.logging import logger
 
 ssl._create_default_https_context = ssl._create_unverified_context
@@ -137,10 +138,11 @@ async def extract_youtube_transcript(state: ProcessSourceState):
     Parse the text file and print its content.
     """
 
-
-
-
-
+    assert state.url, "No URL provided"
+    logger.warning(f"Extracting transcript from URL: {state.url}")
+    languages = CONFIG.get("youtube_transcripts", {}).get(
+        "preferred_languages", ["en", "es", "pt"]
+    )
 
     video_id = await _extract_youtube_id(state.url)
     transcript = await get_best_transcript(video_id, languages)
@@ -152,9 +154,24 @@ async def extract_youtube_transcript(state: ProcessSourceState):
     except Exception as e:
         logger.critical(f"Failed to get video title for video_id: {video_id}")
         logger.exception(e)
-        title =
+        title = ""
+
+    try:
+        formatted_content = formatter.format_transcript(transcript)
+    except Exception as e:
+        logger.critical(f"Failed to format transcript for video_id: {video_id}")
+        logger.exception(e)
+        formatted_content = ""
+
+    try:
+        transcript_raw = transcript.to_raw_data()
+    except Exception as e:
+        logger.critical(f"Failed to get raw transcript for video_id: {video_id}")
+        logger.exception(e)
+        transcript_raw = ""
+
     return {
-        "content":
+        "content": formatted_content,
         "title": title,
-        "metadata": {"video_id": video_id, "transcript":
+        "metadata": {"video_id": video_id, "transcript": transcript_raw},
     }
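The new CONFIG lookup above implies a matching youtube_transcripts block in cc_config.yaml, presumably the +4/-0 change listed at the top of this diff. The file's contents are not shown here, so this is a hypothetical reconstruction, with key names taken from the lookup and its Python defaults used as values:

```yaml
# Hypothetical sketch of the cc_config.yaml addition (not shown in this diff).
youtube_transcripts:
  preferred_languages: ["en", "es", "pt"]
```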
{content_core-0.8.1.dist-info → content_core-0.8.3.dist-info}/RECORD
CHANGED

@@ -1,5 +1,5 @@
 content_core/__init__.py,sha256=ANKeslNXOGumwrkjqgRik23e5PdGps2C0FSup8_XH2Y,6515
-content_core/cc_config.yaml,sha256=
+content_core/cc_config.yaml,sha256=tfbnJ4h9DWuJUljJrnz72s6TD24hD5P-uEPA9K_pNVY,767
 content_core/config.py,sha256=-aUsTB6Z3fa_XIWdHNXhMgWkVLWjEW1kfyQXXB_-j54,1632
 content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
 content_core/models.py,sha256=FBV_tV6cmI0F82WfcA6xHag-YMsxI1dIbDGWG-3Eq_Y,935
@@ -15,24 +15,24 @@ content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCr
 content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
 content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
-content_core/content/extraction/graph.py,sha256=
+content_core/content/extraction/graph.py,sha256=Z8IqcFQmWLJG44jJ4399mBDQVMH-mYuQQpBDHTBUEe0,7571
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
 content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
 content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
 content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
-content_core/processors/docling.py,sha256=
+content_core/processors/docling.py,sha256=dkXehsQdfyWXfrK1K_6Pye50ABM7DxMk6TMguabM9Pc,2151
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
 content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
-content_core/processors/url.py,sha256=
+content_core/processors/url.py,sha256=qdtEIhZpi62zMXbwbCmmh86ySoomscwqxHdFib7QC-M,7898
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
-content_core/processors/youtube.py,sha256=
+content_core/processors/youtube.py,sha256=g_A-rv5bzq2GIuwqMH70YAnDK-4BZqpgQP0IQ3j9zXE,6340
 content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-0.8.
-content_core-0.8.
-content_core-0.8.
-content_core-0.8.
-content_core-0.8.
+content_core-0.8.3.dist-info/METADATA,sha256=NouAIeLanmxLdBOYuZGPOuWaEcz9nhsx7lQL9I6YEkI,11439
+content_core-0.8.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-0.8.3.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
+content_core-0.8.3.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-0.8.3.dist-info/RECORD,,
{content_core-0.8.1.dist-info → content_core-0.8.3.dist-info}/WHEEL
File without changes

{content_core-0.8.1.dist-info → content_core-0.8.3.dist-info}/entry_points.txt
File without changes

{content_core-0.8.1.dist-info → content_core-0.8.3.dist-info}/licenses/LICENSE
File without changes