content-core 0.5.1-py3-none-any.whl → 0.7.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of content-core might be problematic.

--- a/content_core/processors/audio.py
+++ b/content_core/processors/audio.py
@@ -1,13 +1,13 @@
 import asyncio
 import os
+import tempfile
+import math
+import traceback
 from functools import partial
-from math import ceil
-
-from pydub import AudioSegment
+from moviepy import AudioFileClip
 
 from content_core.common import ProcessSourceState
 from content_core.logging import logger
-from content_core.models import ModelFactory
 
 # todo: remove reference to model_manager
 # future: parallelize the transcription process
@@ -29,31 +29,29 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
         output_prefix = os.path.splitext(os.path.basename(input_file_abs))[0]
 
     # Load the audio file
-    audio = AudioSegment.from_file(input_file_abs)
+    audio = AudioFileClip(input_file_abs)
 
-    # Calculate segment length in milliseconds
-    segment_length_ms = segment_length_minutes * 60 * 1000
+    # Calculate segment length in seconds
+    segment_length_s = segment_length_minutes * 60
 
     # Calculate number of segments
-    total_segments = ceil(len(audio) / segment_length_ms)
+    total_segments = math.ceil(audio.duration / segment_length_s)
     logger.debug(f"Splitting file: {input_file_abs} into {total_segments} segments")
 
     output_files = []
 
     # Split the audio into segments
     for i in range(total_segments):
-        start_time = i * segment_length_ms
-        end_time = min((i + 1) * segment_length_ms, len(audio))
+        start_time = i * segment_length_s
+        end_time = min((i + 1) * segment_length_s, audio.duration)
 
         # Extract segment
-        segment = audio[start_time:end_time]
-
-        # Generate output filename
         output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
         output_path = os.path.join(output_dir, output_filename)
 
         # Export segment
-        segment.export(output_path, format="mp3")
+        extract_audio(input_file_abs, output_path, start_time, end_time)
+
         output_files.append(output_path)
 
         logger.debug(f"Exported segment {i+1}/{total_segments}: {output_filename}")
@@ -66,43 +64,87 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
     )
 
 
+def extract_audio(input_file: str, output_file: str, start_time: float = None, end_time: float = None) -> None:
+    """
+    Extract audio from a video or audio file and save it as an MP3 file.
+    If start_time and end_time are provided, only that segment of audio is extracted.
+
+    Args:
+        input_file (str): Path to the input video or audio file.
+        output_file (str): Path where the output MP3 file will be saved.
+        start_time (float, optional): Start time of the audio segment in seconds. Defaults to None.
+        end_time (float, optional): End time of the audio segment in seconds. Defaults to None.
+    """
+    try:
+        # Load the file as an AudioFileClip
+        audio_clip = AudioFileClip(input_file)
+
+        # If start_time and end_time are provided, trim the audio
+        if start_time is not None and end_time is not None:
+            audio_clip = audio_clip.cutout(0, start_time).cutout(end_time - start_time, audio_clip.duration)
+        elif start_time is not None:
+            audio_clip = audio_clip.cutout(0, start_time)
+        elif end_time is not None:
+            audio_clip = audio_clip.cutout(end_time, audio_clip.duration)
+
+        # Export the audio as MP3
+        audio_clip.write_audiofile(output_file, codec='mp3')
+        audio_clip.close()
+    except Exception as e:
+        logger.error(f"Error extracting audio: {str(e)}")
+        raise
+
+
 async def transcribe_audio_segment(audio_file, model):
     """Transcribe a single audio segment asynchronously"""
     return (await model.atranscribe(audio_file)).text
 
 
-async def extract_audio(data: ProcessSourceState):
+async def extract_audio_data(data: ProcessSourceState):
     input_audio_path = data.file_path
-    audio_files = []
 
     try:
-        # Split audio into segments
-        audio_files = await split_audio(input_audio_path)
+        # Create a temporary directory for audio segments
+        temp_dir = tempfile.mkdtemp()
+        output_prefix = os.path.splitext(os.path.basename(input_audio_path))[0]
+        output_dir = temp_dir
+        os.makedirs(output_dir, exist_ok=True)
 
-        # Transcribe all segments concurrently
-        speech_to_text_model = ModelFactory.get_model("speech_to_text")
-        transcribe_tasks = [
-            transcribe_audio_segment(audio_file, speech_to_text_model)
-            for audio_file in audio_files
-        ]
-        transcriptions = await asyncio.gather(*transcribe_tasks)
+        # Split audio into segments if longer than 10 minutes
+        audio = AudioFileClip(input_audio_path)
+        duration_s = audio.duration
+        segment_length_s = 10 * 60 # 10 minutes in seconds
+        output_files = []
+
+        if duration_s > segment_length_s:
+            logger.info(f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments")
+            for i in range(math.ceil(duration_s / segment_length_s)):
+                start_time = i * segment_length_s
+                end_time = min((i + 1) * segment_length_s, audio.duration)
 
-        return {"content": " ".join(transcriptions)}
+                # Extract segment
+                output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
+                output_path = os.path.join(output_dir, output_filename)
 
+                extract_audio(input_audio_path, output_path, start_time, end_time)
+
+                output_files.append(output_path)
+        else:
+            output_files = [input_audio_path]
+
+        # Transcribe audio files
+        from content_core.models import ModelFactory
+        speech_to_text_model = ModelFactory.get_model("speech_to_text")
+        transcriptions = []
+        for audio_file in output_files:
+            transcription = await transcribe_audio_segment(audio_file, speech_to_text_model)
+            transcriptions.append(transcription)
+
+        return {
+            "metadata": {"audio_files": output_files},
+            "content": " ".join(transcriptions)
+        }
     except Exception as e:
-        logger.error(f"Error transcribing audio: {str(e)}")
-        logger.exception(e)
+        logger.error(f"Error processing audio: {str(e)}")
+        logger.error(traceback.format_exc())
         raise
-
-    finally:
-        # Clean up temporary files
-        def _cleanup(files):
-            for file in files:
-                try:
-                    os.remove(file)
-                except OSError as e:
-                    logger.error(f"Error removing temporary file {file}: {str(e)}")
-
-        await asyncio.get_event_loop().run_in_executor(
-            None, partial(_cleanup, audio_files)
-        )
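Side note on the new helper: `extract_audio` above is a plain synchronous function that writes an MP3 via moviepy, so it can be exercised directly. A minimal usage sketch, assuming the module path shown in the RECORD below; the file names are hypothetical examples, not from the package:

    # Hypothetical usage of the new moviepy-based helper (paths are examples only)
    from content_core.processors.audio import extract_audio

    # Write the first 30 seconds of the input's audio track to an MP3 file
    extract_audio("talk.mp4", "talk_intro.mp3", start_time=0.0, end_time=30.0)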
--- a/content_core/processors/url.py
+++ b/content_core/processors/url.py
@@ -1,7 +1,9 @@
 import re
 from urllib.parse import urlparse
+from io import BytesIO
 
 import aiohttp
+import docx
 from bs4 import BeautifulSoup, Comment
 
 from content_core.common import ProcessSourceState
@@ -12,6 +14,49 @@ from content_core.processors.pdf import SUPPORTED_FITZ_TYPES
 # https://github.com/buriy/python-readability
 # also try readability: from readability import Document
 
+DOCX_MIME_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+
+async def _extract_docx_content(docx_bytes: bytes, url: str):
+    """
+    Extract content from DOCX file bytes.
+    """
+    try:
+        logger.debug(f"Attempting to parse DOCX from URL: {url} with python-docx")
+        doc = docx.Document(BytesIO(docx_bytes))
+        content_parts = [p.text for p in doc.paragraphs if p.text]
+        full_content = "\n\n".join(content_parts)
+
+        # Try to get a title from document properties or first heading
+        title = doc.core_properties.title
+        if not title and doc.paragraphs:
+            # Look for a potential title in the first few paragraphs (e.g., if styled as heading)
+            for p in doc.paragraphs[:5]: # Check first 5 paragraphs
+                if p.style.name.startswith('Heading'):
+                    title = p.text
+                    break
+            if not title: # Fallback to first line if no heading found
+                title = doc.paragraphs[0].text.strip() if doc.paragraphs[0].text.strip() else None
+
+        # If no title found, use filename from URL
+        if not title:
+            title = urlparse(url).path.split('/')[-1]
+
+        logger.info(f"Successfully extracted content from DOCX: {url}, Title: {title}")
+        return {
+            "title": title,
+            "content": full_content,
+            "domain": urlparse(url).netloc,
+            "url": url,
+        }
+    except Exception as e:
+        logger.error(f"Failed to process DOCX content from {url}: {e}")
+        # Fallback or re-raise, depending on desired error handling
+        return {
+            "title": f"Error Processing DOCX: {urlparse(url).path.split('/')[-1]}",
+            "content": f"Failed to extract content from DOCX: {e}",
+            "domain": urlparse(url).netloc,
+            "url": url,
+        }
 
 
 async def url_provider(state: ProcessSourceState):
@@ -54,6 +99,13 @@ async def extract_url_bs4(url: str):
     async with aiohttp.ClientSession() as session:
         async with session.get(url, headers=headers, timeout=10) as response:
             response.raise_for_status()
+            # Check content type for DOCX
+            if response.content_type == DOCX_MIME_TYPE:
+                logger.debug(f"Detected DOCX content type for {url}")
+                docx_bytes = await response.read()
+                return await _extract_docx_content(docx_bytes, url)
+
+            # If not DOCX, proceed as HTML
             html_content = await response.text()
 
     soup = BeautifulSoup(html_content, "html.parser")
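The DOCX branch above combines two standard APIs: aiohttp exposes the parsed MIME type as `response.content_type`, and python-docx's `Document` accepts any file-like object, which is why the bytes are wrapped in `BytesIO`. A standalone sketch of just the parsing step (`parse_docx_bytes` is an illustrative name, not part of the package):

    # Minimal sketch of the python-docx parsing used by _extract_docx_content
    from io import BytesIO
    import docx

    def parse_docx_bytes(docx_bytes: bytes) -> str:
        doc = docx.Document(BytesIO(docx_bytes))
        # Join non-empty paragraphs, as the new url.py code does
        return "\n\n".join(p.text for p in doc.paragraphs if p.text)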
--- a/content_core-0.5.1.dist-info/METADATA
+++ b/content_core-0.7.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 0.5.1
+Version: 0.7.0
 Summary: Extract what matters from any media source
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
@@ -9,16 +9,14 @@ Requires-Dist: ai-prompter>=0.2.3
 Requires-Dist: aiohttp>=3.11
 Requires-Dist: bs4>=0.0.2
 Requires-Dist: dicttoxml>=1.7.16
-Requires-Dist: esperanto>=1.2.0
-Requires-Dist: google-genai>=1.10.0
+Requires-Dist: esperanto[openai]>=1.2.0
 Requires-Dist: jinja2>=3.1.6
 Requires-Dist: langdetect>=1.0.9
 Requires-Dist: langgraph>=0.3.29
 Requires-Dist: loguru>=0.7.3
-Requires-Dist: openai>=1.73.0
+Requires-Dist: moviepy>=2.1.2
 Requires-Dist: openpyxl>=3.1.5
 Requires-Dist: pandas>=2.2.3
-Requires-Dist: pydub>=0.25.1
 Requires-Dist: pymupdf>=1.25.5
 Requires-Dist: python-docx>=1.1.2
 Requires-Dist: python-dotenv>=1.1.0
@@ -28,7 +26,7 @@ Requires-Dist: validators>=0.34.0
 Requires-Dist: youtube-transcript-api>=1.0.3
 Provides-Extra: docling
 Requires-Dist: asciidoc; extra == 'docling'
-Requires-Dist: docling[ocr]; extra == 'docling'
+Requires-Dist: docling; extra == 'docling'
 Requires-Dist: pandas; extra == 'docling'
 Requires-Dist: pillow; extra == 'docling'
 Description-Content-Type: text/markdown
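Net effect of the metadata changes: pydub, openai, and google-genai are dropped in favor of moviepy and the esperanto[openai] extra, and the docling extra no longer pulls OCR support. A quick way to confirm the declared dependencies of an installed copy, using only the standard library (this check is illustrative, not part of the package):

    # Inspect the installed wheel's declared dependencies
    from importlib.metadata import requires

    deps = requires("content-core") or []
    print([d for d in deps if d.startswith(("moviepy", "pydub", "openai"))])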
--- a/content_core-0.5.1.dist-info/RECORD
+++ b/content_core-0.7.0.dist-info/RECORD
@@ -14,27 +14,24 @@ content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCr
 content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
 content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
-content_core/content/extraction/graph.py,sha256=Sp9XJ6AoLXA_FUFWhmfTMzOC2gkarp1Qg8MsIScLCok,6213
+content_core/content/extraction/graph.py,sha256=IKu-bV3YG2MigHnYixYYhtrQ-4qgGpETerXBEFn73zU,6304
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
 content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
-content_core/notebooks/docling.ipynb,sha256=aTad8NORNd-TUMlbX58DURJ4-QCeplTeTT0vUj301m0,631
-content_core/notebooks/run.ipynb,sha256=lV8n1fx_kgIQHBnk1vR6ChBjMS5luAEuDDljsTBNjrQ,369490
-content_core/processors/audio.py,sha256=jDn0_6F5dLcmz_C-iR80uOqOIAz49ELya2R5JeM15vo,3538
+content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
+content_core/processors/audio.py,sha256=KnwxK85X9qRyVziMhFd103kfHkE8qGB1D4yW5lYO90E,5701
 content_core/processors/docling.py,sha256=wQ8ThAcyrCy-c95QtgplQ9UZtjCZTddLD9y1_CrRtSQ,2111
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
 content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
-content_core/processors/url.py,sha256=yhAnvIlYKc13iZedwA0ck6h6wd2j6T-Q2NAtMen3hIs,6783
+content_core/processors/url.py,sha256=vmkBVfJ1xpZQzlhRdkO64V1J9xdTBr6nrXY4M74QzEo,9094
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
 content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
-content_core/prompts/content/cleanup.jinja,sha256=elyjbm9O_AeOcxkG-kui5wjBIRiOQCicjm92I4NmoVA,693
-content_core/prompts/content/summarize.jinja,sha256=zLPbomfjA-tQZr-c_rOqvKhd55R8NN3Q2gLyLR1sKso,817
 content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-0.5.1.dist-info/METADATA,sha256=mkvdVcLsiBDGiobgswCVQF8Xkceq5VpIRZspniB61PY,10533
-content_core-0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-content_core-0.5.1.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
-content_core-0.5.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
-content_core-0.5.1.dist-info/RECORD,,
+content_core-0.7.0.dist-info/METADATA,sha256=CFTVOA8hnMcofSlIlR-RwcCmvD9Hsa6mxFPjisBMKus,10471
+content_core-0.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-0.7.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
+content_core-0.7.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-0.7.0.dist-info/RECORD,,
--- a/content_core/notebooks/docling.ipynb
+++ /dev/null
@@ -1,27 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from docling.document_converter import DocumentConverter\n",
-    "\n",
-    "\n",
-    "source = \"/Users/luisnovo/dev/projetos/content-core/tests/input_content/file.docx\"\n",
-    "source_url = \"https://arxiv.org/pdf/2408.09869\" # PDF path or URL\n",
-    "converter = DocumentConverter()\n",
-    "result = converter.convert(source)\n",
-    "print(result.document.export_to_markdown())"
-   ]
-  }
- ],
- "metadata": {
-  "language_info": {
-   "name": "python"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
--- a/content_core/prompts/content/cleanup.jinja
+++ /dev/null
@@ -1,16 +0,0 @@
-# GOAL
-
-Adjust the content below to make it clean and readable:
-Remove repeated strings that do not add value to the text.
-
-Remove any content unrelated to the text itself (e.g., metadata, artifacts, or extraction errors).
-
-Format the output as unstructured but clear text.
-
-Do not add extra text, introductions, conclusions, or commentary—only rewrite the provided content as it is.
-
-Do not interpret, analyze, or alter the meaning, intent, or narrative of the text—just reformat it for clarity and readability.
-
-Do not change the text structure, do not write conclusions about it. Your only job is to make it readable.
-
-Keep the text in its original language, regardless of what it is.
--- a/content_core/prompts/content/summarize.jinja
+++ /dev/null
@@ -1,25 +0,0 @@
-You are an AI assistant for a personal study platform.
-
-In this platform, your user collects various articles and content from the Internet for reference and study.
-
-Your role is to summarize the selected content as densely as possible, helping the reader extract maximum value from it without reading the full text.
-Focus solely on the content's value, avoiding unnecessary comments or messages.
-
-The summary should be dense, rich in characters, and designed to create a powerful vector representation.
-If the user provided additional context, follow its instructions. Otherwise, summary the whole content.
-
-Do not return any acknowledgments or greetings—only the summary.
-
-CONTENT:
-
-{{ content }}
-
-{% if context %}
-CONTEXT:
-
-User has provided the aditional context for your task:
-{{context}}
-{% endif%}
-
-
-SUMMARY: