content-core 0.6.0__tar.gz → 0.7.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic; see the registry's advisory page for more details.

Files changed (59)
  1. {content_core-0.6.0 → content_core-0.7.2}/PKG-INFO +3 -5
  2. {content_core-0.6.0 → content_core-0.7.2}/pyproject.toml +4 -5
  3. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/common/state.py +9 -2
  4. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/content/extraction/graph.py +2 -9
  5. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/url.py +52 -0
  6. {content_core-0.6.0 → content_core-0.7.2}/uv.lock +1019 -990
  7. {content_core-0.6.0 → content_core-0.7.2}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  8. {content_core-0.6.0 → content_core-0.7.2}/.github/workflows/publish.yml +0 -0
  9. {content_core-0.6.0 → content_core-0.7.2}/.gitignore +0 -0
  10. {content_core-0.6.0 → content_core-0.7.2}/.python-version +0 -0
  11. {content_core-0.6.0 → content_core-0.7.2}/CONTRIBUTING.md +0 -0
  12. {content_core-0.6.0 → content_core-0.7.2}/LICENSE +0 -0
  13. {content_core-0.6.0 → content_core-0.7.2}/Makefile +0 -0
  14. {content_core-0.6.0 → content_core-0.7.2}/README.md +0 -0
  15. {content_core-0.6.0 → content_core-0.7.2}/docs/processors.md +0 -0
  16. {content_core-0.6.0 → content_core-0.7.2}/docs/usage.md +0 -0
  17. {content_core-0.6.0 → content_core-0.7.2}/prompts/content/cleanup.jinja +0 -0
  18. {content_core-0.6.0 → content_core-0.7.2}/prompts/content/summarize.jinja +0 -0
  19. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/__init__.py +0 -0
  20. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/cc_config.yaml +0 -0
  21. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/common/__init__.py +0 -0
  22. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/common/exceptions.py +0 -0
  23. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/common/utils.py +0 -0
  24. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/config.py +0 -0
  25. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/content/__init__.py +0 -0
  26. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/content/cleanup/__init__.py +0 -0
  27. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/content/cleanup/core.py +0 -0
  28. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/content/extraction/__init__.py +0 -0
  29. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/content/summary/__init__.py +0 -0
  30. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/content/summary/core.py +0 -0
  31. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/logging.py +0 -0
  32. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/models.py +0 -0
  33. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/models_config.yaml +0 -0
  34. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/notebooks/run.ipynb +0 -0
  35. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/audio.py +0 -0
  36. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/docling.py +0 -0
  37. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/office.py +0 -0
  38. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/pdf.py +0 -0
  39. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/text.py +0 -0
  40. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/video.py +0 -0
  41. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/processors/youtube.py +0 -0
  42. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/py.typed +0 -0
  43. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/templated_message.py +0 -0
  44. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/tools/__init__.py +0 -0
  45. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/tools/cleanup.py +0 -0
  46. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/tools/extract.py +0 -0
  47. {content_core-0.6.0 → content_core-0.7.2}/src/content_core/tools/summarize.py +0 -0
  48. {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.docx +0 -0
  49. {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.epub +0 -0
  50. {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.md +0 -0
  51. {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.mp3 +0 -0
  52. {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.mp4 +0 -0
  53. {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.pdf +0 -0
  54. {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.pptx +0 -0
  55. {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.txt +0 -0
  56. {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file.xlsx +0 -0
  57. {content_core-0.6.0 → content_core-0.7.2}/tests/input_content/file_audio.mp3 +0 -0
  58. {content_core-0.6.0 → content_core-0.7.2}/tests/integration/test_extraction.py +0 -0
  59. {content_core-0.6.0 → content_core-0.7.2}/tests/unit/test_docling.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 0.6.0
3
+ Version: 0.7.2
4
4
  Summary: Extract what matters from any media source
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -9,14 +9,12 @@ Requires-Dist: ai-prompter>=0.2.3
9
9
  Requires-Dist: aiohttp>=3.11
10
10
  Requires-Dist: bs4>=0.0.2
11
11
  Requires-Dist: dicttoxml>=1.7.16
12
- Requires-Dist: esperanto>=1.2.0
13
- Requires-Dist: google-genai>=1.10.0
12
+ Requires-Dist: esperanto[openai]>=1.2.0
14
13
  Requires-Dist: jinja2>=3.1.6
15
14
  Requires-Dist: langdetect>=1.0.9
16
15
  Requires-Dist: langgraph>=0.3.29
17
16
  Requires-Dist: loguru>=0.7.3
18
17
  Requires-Dist: moviepy>=2.1.2
19
- Requires-Dist: openai>=1.73.0
20
18
  Requires-Dist: openpyxl>=3.1.5
21
19
  Requires-Dist: pandas>=2.2.3
22
20
  Requires-Dist: pymupdf>=1.25.5
@@ -28,7 +26,7 @@ Requires-Dist: validators>=0.34.0
28
26
  Requires-Dist: youtube-transcript-api>=1.0.3
29
27
  Provides-Extra: docling
30
28
  Requires-Dist: asciidoc; extra == 'docling'
31
- Requires-Dist: docling[ocr]; extra == 'docling'
29
+ Requires-Dist: docling; extra == 'docling'
32
30
  Requires-Dist: pandas; extra == 'docling'
33
31
  Requires-Dist: pillow; extra == 'docling'
34
32
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "content-core"
3
- version = "0.6.0"
3
+ version = "0.7.2"
4
4
  description = "Extract what matters from any media source"
5
5
  readme = "README.md"
6
6
  homepage = "https://github.com/lfnovo/content-core"
@@ -11,12 +11,10 @@ requires-python = ">=3.10"
11
11
  dependencies = [
12
12
  "aiohttp>=3.11",
13
13
  "bs4>=0.0.2",
14
- "esperanto>=1.2.0",
15
- "google-genai>=1.10.0",
14
+ "esperanto[openai]>=1.2.0",
16
15
  "jinja2>=3.1.6",
17
16
  "langdetect>=1.0.9",
18
17
  "loguru>=0.7.3",
19
- "openai>=1.73.0",
20
18
  "openpyxl>=3.1.5",
21
19
  "pandas>=2.2.3",
22
20
  "pymupdf>=1.25.5",
@@ -33,7 +31,7 @@ dependencies = [
33
31
  ]
34
32
 
35
33
  [project.optional-dependencies]
36
- docling = ["docling[ocr]", "Pillow", "pandas", "asciidoc"]
34
+ docling = ["docling", "Pillow", "pandas", "asciidoc"]
37
35
 
38
36
  [project.scripts]
39
37
  ccore = "content_core:ccore"
@@ -54,6 +52,7 @@ package-dir = {"content_core" = "src/content_core"}
54
52
  dev = [
55
53
  "ipykernel>=4.0.1",
56
54
  "ipywidgets>=4.0.0",
55
+ "openai>=1.78.1",
57
56
  "pyperclip>=1.9.0",
58
57
  "pytest>=7.2.0",
59
58
  "pytest-asyncio>=0.21.0",
@@ -13,8 +13,13 @@ class ProcessSourceState(BaseModel):
13
13
  identified_provider: Optional[str] = ""
14
14
  metadata: Optional[dict] = Field(default_factory=lambda: {})
15
15
  content: Optional[str] = ""
16
- engine: Optional[str] = Field(default=None, description="Override extraction engine: 'legacy' or 'docling'")
17
- output_format: Optional[str] = Field(default=None, description="Override Docling output format: 'markdown', 'html', or 'json'")
16
+ engine: Optional[str] = Field(
17
+ default=None, description="Override extraction engine: 'legacy' or 'docling'"
18
+ )
19
+ output_format: Optional[str] = Field(
20
+ default=None,
21
+ description="Override Docling output format: 'markdown', 'html', or 'json'",
22
+ )
18
23
 
19
24
 
20
25
  class ProcessSourceInput(BaseModel):
@@ -27,6 +32,8 @@ class ProcessSourceInput(BaseModel):
27
32
 
28
33
  class ProcessSourceOutput(BaseModel):
29
34
  title: Optional[str] = ""
35
+ file_path: Optional[str] = ""
36
+ url: Optional[str] = ""
30
37
  source_type: Optional[str] = ""
31
38
  identified_type: Optional[str] = ""
32
39
  identified_provider: Optional[str] = ""
@@ -15,10 +15,8 @@ from content_core.common import (
15
15
  from content_core.config import CONFIG # type: ignore
16
16
  from content_core.logging import logger
17
17
  from content_core.processors.audio import extract_audio_data # type: ignore
18
- from content_core.processors.docling import (
19
- DOCLING_SUPPORTED, # type: ignore
20
- extract_with_docling,
21
- )
18
+ from content_core.processors.docling import DOCLING_SUPPORTED # type: ignore
19
+ from content_core.processors.docling import extract_with_docling
22
20
  from content_core.processors.office import (
23
21
  SUPPORTED_OFFICE_TYPES,
24
22
  extract_office_content,
@@ -186,8 +184,3 @@ workflow.add_edge("download_remote_file", "file_type")
186
184
 
187
185
  # Compile graph
188
186
  graph = workflow.compile()
189
-
190
- # Compile graph
191
- graph = workflow.compile()
192
- # Compile graph
193
- graph = workflow.compile()
@@ -1,7 +1,9 @@
1
1
  import re
2
2
  from urllib.parse import urlparse
3
+ from io import BytesIO
3
4
 
4
5
  import aiohttp
6
+ import docx
5
7
  from bs4 import BeautifulSoup, Comment
6
8
 
7
9
  from content_core.common import ProcessSourceState
@@ -12,6 +14,49 @@ from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
12
14
  # https://github.com/buriy/python-readability
13
15
  # also try readability: from readability import Document
14
16
 
17
+ DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
18
+
19
+ async def _extract_docx_content(docx_bytes: bytes, url: str):
20
+ """
21
+ Extract content from DOCX file bytes.
22
+ """
23
+ try:
24
+ logger.debug(f"Attempting to parse DOCX from URL: {url} with python-docx")
25
+ doc = docx.Document(BytesIO(docx_bytes))
26
+ content_parts = [p.text for p in doc.paragraphs if p.text]
27
+ full_content = "\n\n".join(content_parts)
28
+
29
+ # Try to get a title from document properties or first heading
30
+ title = doc.core_properties.title
31
+ if not title and doc.paragraphs:
32
+ # Look for a potential title in the first few paragraphs (e.g., if styled as heading)
33
+ for p in doc.paragraphs[:5]: # Check first 5 paragraphs
34
+ if p.style.name.startswith('Heading'):
35
+ title = p.text
36
+ break
37
+ if not title: # Fallback to first line if no heading found
38
+ title = doc.paragraphs[0].text.strip() if doc.paragraphs[0].text.strip() else None
39
+
40
+ # If no title found, use filename from URL
41
+ if not title:
42
+ title = urlparse(url).path.split('/')[-1]
43
+
44
+ logger.info(f"Successfully extracted content from DOCX: {url}, Title: {title}")
45
+ return {
46
+ "title": title,
47
+ "content": full_content,
48
+ "domain": urlparse(url).netloc,
49
+ "url": url,
50
+ }
51
+ except Exception as e:
52
+ logger.error(f"Failed to process DOCX content from {url}: {e}")
53
+ # Fallback or re-raise, depending on desired error handling
54
+ return {
55
+ "title": f"Error Processing DOCX: {urlparse(url).path.split('/')[-1]}",
56
+ "content": f"Failed to extract content from DOCX: {e}",
57
+ "domain": urlparse(url).netloc,
58
+ "url": url,
59
+ }
15
60
 
16
61
  async def url_provider(state: ProcessSourceState):
17
62
  """
@@ -54,6 +99,13 @@ async def extract_url_bs4(url: str):
54
99
  async with aiohttp.ClientSession() as session:
55
100
  async with session.get(url, headers=headers, timeout=10) as response:
56
101
  response.raise_for_status()
102
+ # Check content type for DOCX
103
+ if response.content_type == DOCX_MIME_TYPE:
104
+ logger.debug(f"Detected DOCX content type for {url}")
105
+ docx_bytes = await response.read()
106
+ return await _extract_docx_content(docx_bytes, url)
107
+
108
+ # If not DOCX, proceed as HTML
57
109
  html_content = await response.text()
58
110
 
59
111
  soup = BeautifulSoup(html_content, "html.parser")