content-core 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic.

@@ -15,10 +15,8 @@ from content_core.common import (
15
15
  from content_core.config import CONFIG # type: ignore
16
16
  from content_core.logging import logger
17
17
  from content_core.processors.audio import extract_audio_data # type: ignore
18
- from content_core.processors.docling import (
19
- DOCLING_SUPPORTED, # type: ignore
20
- extract_with_docling,
21
- )
18
+ from content_core.processors.docling import DOCLING_SUPPORTED # type: ignore
19
+ from content_core.processors.docling import extract_with_docling
22
20
  from content_core.processors.office import (
23
21
  SUPPORTED_OFFICE_TYPES,
24
22
  extract_office_content,
@@ -186,8 +184,3 @@ workflow.add_edge("download_remote_file", "file_type")
 
 # Compile graph
 graph = workflow.compile()
-
-# Compile graph
-graph = workflow.compile()
-# Compile graph
-graph = workflow.compile()
@@ -1,7 +1,9 @@
 import re
 from urllib.parse import urlparse
+from io import BytesIO
 
 import aiohttp
+import docx
 from bs4 import BeautifulSoup, Comment
 
 from content_core.common import ProcessSourceState
@@ -12,6 +14,49 @@ from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
 # https://github.com/buriy/python-readability
 # also try readability: from readability import Document
 
+DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+
+async def _extract_docx_content(docx_bytes: bytes, url: str):
+    """
+    Extract content from DOCX file bytes.
+    """
+    try:
+        logger.debug(f"Attempting to parse DOCX from URL: {url} with python-docx")
+        doc = docx.Document(BytesIO(docx_bytes))
+        content_parts = [p.text for p in doc.paragraphs if p.text]
+        full_content = "\n\n".join(content_parts)
+
+        # Try to get a title from document properties or first heading
+        title = doc.core_properties.title
+        if not title and doc.paragraphs:
+            # Look for a potential title in the first few paragraphs (e.g., if styled as heading)
+            for p in doc.paragraphs[:5]:  # Check first 5 paragraphs
+                if p.style.name.startswith('Heading'):
+                    title = p.text
+                    break
+            if not title:  # Fallback to first line if no heading found
+                title = doc.paragraphs[0].text.strip() if doc.paragraphs[0].text.strip() else None
+
+        # If no title found, use filename from URL
+        if not title:
+            title = urlparse(url).path.split('/')[-1]
+
+        logger.info(f"Successfully extracted content from DOCX: {url}, Title: {title}")
+        return {
+            "title": title,
+            "content": full_content,
+            "domain": urlparse(url).netloc,
+            "url": url,
+        }
+    except Exception as e:
+        logger.error(f"Failed to process DOCX content from {url}: {e}")
+        # Fallback or re-raise, depending on desired error handling
+        return {
+            "title": f"Error Processing DOCX: {urlparse(url).path.split('/')[-1]}",
+            "content": f"Failed to extract content from DOCX: {e}",
+            "domain": urlparse(url).netloc,
+            "url": url,
+        }
 
 async def url_provider(state: ProcessSourceState):
     """
@@ -54,6 +99,13 @@ async def extract_url_bs4(url: str):
     async with aiohttp.ClientSession() as session:
         async with session.get(url, headers=headers, timeout=10) as response:
            response.raise_for_status()
+            # Check content type for DOCX
+            if response.content_type == DOCX_MIME_TYPE:
+                logger.debug(f"Detected DOCX content type for {url}")
+                docx_bytes = await response.read()
+                return await _extract_docx_content(docx_bytes, url)
+
+            # If not DOCX, proceed as HTML
             html_content = await response.text()
 
     soup = BeautifulSoup(html_content, "html.parser")
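With this branch in place, extract_url_bs4 inspects the response's Content-Type, routes DOCX responses through _extract_docx_content, and lets everything else continue down the existing BeautifulSoup HTML path, so callers get the same dict shape either way. A minimal sketch of the new flow, assuming extract_url_bs4 is importable from content_core.processors.url (the URL is illustrative):

    import asyncio

    from content_core.processors.url import extract_url_bs4  # assumed module path

    async def main():
        # A URL served with the DOCX MIME type is parsed with python-docx;
        # an ordinary HTML page still goes through BeautifulSoup.
        result = await extract_url_bs4("https://example.com/files/report.docx")
        print(result["title"], result["domain"])

    asyncio.run(main())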
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 0.6.0
+Version: 0.7.0
 Summary: Extract what matters from any media source
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
@@ -9,14 +9,12 @@ Requires-Dist: ai-prompter>=0.2.3
 Requires-Dist: aiohttp>=3.11
 Requires-Dist: bs4>=0.0.2
 Requires-Dist: dicttoxml>=1.7.16
-Requires-Dist: esperanto>=1.2.0
-Requires-Dist: google-genai>=1.10.0
+Requires-Dist: esperanto[openai]>=1.2.0
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: langdetect>=1.0.9
 Requires-Dist: langgraph>=0.3.29
 Requires-Dist: loguru>=0.7.3
 Requires-Dist: moviepy>=2.1.2
-Requires-Dist: openai>=1.73.0
 Requires-Dist: openpyxl>=3.1.5
 Requires-Dist: pandas>=2.2.3
 Requires-Dist: pymupdf>=1.25.5
@@ -28,7 +26,7 @@ Requires-Dist: validators>=0.34.0
 Requires-Dist: youtube-transcript-api>=1.0.3
 Provides-Extra: docling
 Requires-Dist: asciidoc; extra == 'docling'
-Requires-Dist: docling[ocr]; extra == 'docling'
+Requires-Dist: docling; extra == 'docling'
 Requires-Dist: pandas; extra == 'docling'
 Requires-Dist: pillow; extra == 'docling'
 Description-Content-Type: text/markdown
@@ -14,7 +14,7 @@ content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCr
 content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
 content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
-content_core/content/extraction/graph.py,sha256=d5Hp7GS2dFpYQIHFTIFhU-7ySZ3lfipdDxZZpe2DXS8,6361
+content_core/content/extraction/graph.py,sha256=IKu-bV3YG2MigHnYixYYhtrQ-4qgGpETerXBEFn73zU,6304
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
 content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
 content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
@@ -23,15 +23,15 @@ content_core/processors/docling.py,sha256=wQ8ThAcyrCy-c95QtgplQ9UZtjCZTddLD9y1_C
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
 content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
-content_core/processors/url.py,sha256=yhAnvIlYKc13iZedwA0ck6h6wd2j6T-Q2NAtMen3hIs,6783
+content_core/processors/url.py,sha256=vmkBVfJ1xpZQzlhRdkO64V1J9xdTBr6nrXY4M74QzEo,9094
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
 content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
 content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-0.6.0.dist-info/METADATA,sha256=pn72ciBGpWE7tVvJ2j3NmQPmFB60cNrkHBmp5ziuyqk,10534
-content_core-0.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-content_core-0.6.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
-content_core-0.6.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
-content_core-0.6.0.dist-info/RECORD,,
+content_core-0.7.0.dist-info/METADATA,sha256=CFTVOA8hnMcofSlIlR-RwcCmvD9Hsa6mxFPjisBMKus,10471
+content_core-0.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-0.7.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
+content_core-0.7.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-0.7.0.dist-info/RECORD,,