content-core 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release.


This version of content-core might be problematic.

content_core/processors/audio.py
@@ -1,13 +1,13 @@
  import asyncio
  import os
+ import tempfile
+ import math
+ import traceback
  from functools import partial
- from math import ceil
-
- from pydub import AudioSegment
+ from moviepy import AudioFileClip

  from content_core.common import ProcessSourceState
  from content_core.logging import logger
- from content_core.models import ModelFactory

  # todo: remove reference to model_manager
  # future: parallelize the transcription process
@@ -29,31 +29,29 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
          output_prefix = os.path.splitext(os.path.basename(input_file_abs))[0]

      # Load the audio file
-     audio = AudioSegment.from_file(input_file_abs)
+     audio = AudioFileClip(input_file_abs)

-     # Calculate segment length in milliseconds
-     segment_length_ms = segment_length_minutes * 60 * 1000
+     # Calculate segment length in seconds
+     segment_length_s = segment_length_minutes * 60

      # Calculate number of segments
-     total_segments = ceil(len(audio) / segment_length_ms)
+     total_segments = math.ceil(audio.duration / segment_length_s)
      logger.debug(f"Splitting file: {input_file_abs} into {total_segments} segments")

      output_files = []

      # Split the audio into segments
      for i in range(total_segments):
-         start_time = i * segment_length_ms
-         end_time = min((i + 1) * segment_length_ms, len(audio))
+         start_time = i * segment_length_s
+         end_time = min((i + 1) * segment_length_s, audio.duration)

          # Extract segment
-         segment = audio[start_time:end_time]
-
-         # Generate output filename
          output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
          output_path = os.path.join(output_dir, output_filename)

          # Export segment
-         segment.export(output_path, format="mp3")
+         extract_audio(input_file_abs, output_path, start_time, end_time)
+
          output_files.append(output_path)

          logger.debug(f"Exported segment {i+1}/{total_segments}: {output_filename}")
@@ -66,43 +64,87 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
      )


+ def extract_audio(input_file: str, output_file: str, start_time: float = None, end_time: float = None) -> None:
+     """
+     Extract audio from a video or audio file and save it as an MP3 file.
+     If start_time and end_time are provided, only that segment of audio is extracted.
+
+     Args:
+         input_file (str): Path to the input video or audio file.
+         output_file (str): Path where the output MP3 file will be saved.
+         start_time (float, optional): Start time of the audio segment in seconds. Defaults to None.
+         end_time (float, optional): End time of the audio segment in seconds. Defaults to None.
+     """
+     try:
+         # Load the file as an AudioFileClip
+         audio_clip = AudioFileClip(input_file)
+
+         # If start_time and end_time are provided, trim the audio
+         if start_time is not None and end_time is not None:
+             audio_clip = audio_clip.cutout(0, start_time).cutout(end_time - start_time, audio_clip.duration)
+         elif start_time is not None:
+             audio_clip = audio_clip.cutout(0, start_time)
+         elif end_time is not None:
+             audio_clip = audio_clip.cutout(end_time, audio_clip.duration)
+
+         # Export the audio as MP3
+         audio_clip.write_audiofile(output_file, codec='mp3')
+         audio_clip.close()
+     except Exception as e:
+         logger.error(f"Error extracting audio: {str(e)}")
+         raise
+
+
  async def transcribe_audio_segment(audio_file, model):
      """Transcribe a single audio segment asynchronously"""
      return (await model.atranscribe(audio_file)).text


- async def extract_audio(data: ProcessSourceState):
+ async def extract_audio_data(data: ProcessSourceState):
      input_audio_path = data.file_path
-     audio_files = []

      try:
-         # Split audio into segments
-         audio_files = await split_audio(input_audio_path)
+         # Create a temporary directory for audio segments
+         temp_dir = tempfile.mkdtemp()
+         output_prefix = os.path.splitext(os.path.basename(input_audio_path))[0]
+         output_dir = temp_dir
+         os.makedirs(output_dir, exist_ok=True)

-         # Transcribe all segments concurrently
-         speech_to_text_model = ModelFactory.get_model("speech_to_text")
-         transcribe_tasks = [
-             transcribe_audio_segment(audio_file, speech_to_text_model)
-             for audio_file in audio_files
-         ]
-         transcriptions = await asyncio.gather(*transcribe_tasks)
+         # Split audio into segments if longer than 10 minutes
+         audio = AudioFileClip(input_audio_path)
+         duration_s = audio.duration
+         segment_length_s = 10 * 60 # 10 minutes in seconds
+         output_files = []
+
+         if duration_s > segment_length_s:
+             logger.info(f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments")
+             for i in range(math.ceil(duration_s / segment_length_s)):
+                 start_time = i * segment_length_s
+                 end_time = min((i + 1) * segment_length_s, audio.duration)

-         return {"content": " ".join(transcriptions)}
+                 # Extract segment
+                 output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
+                 output_path = os.path.join(output_dir, output_filename)

+                 extract_audio(input_audio_path, output_path, start_time, end_time)
+
+                 output_files.append(output_path)
+         else:
+             output_files = [input_audio_path]
+
+         # Transcribe audio files
+         from content_core.models import ModelFactory
+         speech_to_text_model = ModelFactory.get_model("speech_to_text")
+         transcriptions = []
+         for audio_file in output_files:
+             transcription = await transcribe_audio_segment(audio_file, speech_to_text_model)
+             transcriptions.append(transcription)
+
+         return {
+             "metadata": {"audio_files": output_files},
+             "content": " ".join(transcriptions)
+         }
      except Exception as e:
-         logger.error(f"Error transcribing audio: {str(e)}")
-         logger.exception(e)
+         logger.error(f"Error processing audio: {str(e)}")
+         logger.error(traceback.format_exc())
          raise
-
-     finally:
-         # Clean up temporary files
-         def _cleanup(files):
-             for file in files:
-                 try:
-                     os.remove(file)
-                 except OSError as e:
-                     logger.error(f"Error removing temporary file {file}: {str(e)}")
-
-         await asyncio.get_event_loop().run_in_executor(
-             None, partial(_cleanup, audio_files)
-         )
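
For orientation, here is a minimal usage sketch of the reworked processors/audio module. It is not part of the package or of this diff; the file name "podcast.mp3" and the keyword construction ProcessSourceState(file_path=...) are assumptions made for illustration, and running it requires a configured speech-to-text model for ModelFactory.

```python
# Minimal usage sketch (illustrative only, not shipped with content-core).
# Assumptions: a local "podcast.mp3" exists, and ProcessSourceState accepts
# file_path as a keyword argument; neither is confirmed by the diff itself.
import asyncio

from content_core.common import ProcessSourceState
from content_core.processors.audio import extract_audio_data, split_audio


async def main():
    # Split a long recording into MP3 segments using the default
    # segment_length_minutes=15 shown in split_audio above.
    segment_paths = await split_audio("podcast.mp3")
    print(segment_paths)

    # Run the full extraction step: audio longer than 10 minutes is split
    # into temporary segments, each segment is transcribed with the
    # configured speech-to-text model, and the joined transcript is
    # returned under "content".
    state = ProcessSourceState(file_path="podcast.mp3")
    result = await extract_audio_data(state)
    print(result["metadata"]["audio_files"])
    print(result["content"][:200])


asyncio.run(main())
```

As the diff above shows, extract_audio_data now writes segment files into a tempfile.mkdtemp() directory and returns their paths under metadata["audio_files"]; the previous finally-based cleanup was removed, so those temporary files remain on disk.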

content_core-0.5.1.dist-info/METADATA → content_core-0.6.0.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: content-core
- Version: 0.5.1
+ Version: 0.6.0
  Summary: Extract what matters from any media source
  Author-email: LUIS NOVO <lfnovo@gmail.com>
  License-File: LICENSE
@@ -15,10 +15,10 @@ Requires-Dist: jinja2>=3.1.6
  Requires-Dist: langdetect>=1.0.9
  Requires-Dist: langgraph>=0.3.29
  Requires-Dist: loguru>=0.7.3
+ Requires-Dist: moviepy>=2.1.2
  Requires-Dist: openai>=1.73.0
  Requires-Dist: openpyxl>=3.1.5
  Requires-Dist: pandas>=2.2.3
- Requires-Dist: pydub>=0.25.1
  Requires-Dist: pymupdf>=1.25.5
  Requires-Dist: python-docx>=1.1.2
  Requires-Dist: python-dotenv>=1.1.0

content_core-0.5.1.dist-info/RECORD → content_core-0.6.0.dist-info/RECORD
@@ -14,12 +14,11 @@ content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCr
  content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
  content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
  content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
- content_core/content/extraction/graph.py,sha256=Sp9XJ6AoLXA_FUFWhmfTMzOC2gkarp1Qg8MsIScLCok,6213
+ content_core/content/extraction/graph.py,sha256=d5Hp7GS2dFpYQIHFTIFhU-7ySZ3lfipdDxZZpe2DXS8,6361
  content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
  content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
- content_core/notebooks/docling.ipynb,sha256=aTad8NORNd-TUMlbX58DURJ4-QCeplTeTT0vUj301m0,631
- content_core/notebooks/run.ipynb,sha256=lV8n1fx_kgIQHBnk1vR6ChBjMS5luAEuDDljsTBNjrQ,369490
- content_core/processors/audio.py,sha256=jDn0_6F5dLcmz_C-iR80uOqOIAz49ELya2R5JeM15vo,3538
+ content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
+ content_core/processors/audio.py,sha256=KnwxK85X9qRyVziMhFd103kfHkE8qGB1D4yW5lYO90E,5701
  content_core/processors/docling.py,sha256=wQ8ThAcyrCy-c95QtgplQ9UZtjCZTddLD9y1_CrRtSQ,2111
  content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
  content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
@@ -27,14 +26,12 @@ content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz
  content_core/processors/url.py,sha256=yhAnvIlYKc13iZedwA0ck6h6wd2j6T-Q2NAtMen3hIs,6783
  content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
  content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
- content_core/prompts/content/cleanup.jinja,sha256=elyjbm9O_AeOcxkG-kui5wjBIRiOQCicjm92I4NmoVA,693
- content_core/prompts/content/summarize.jinja,sha256=zLPbomfjA-tQZr-c_rOqvKhd55R8NN3Q2gLyLR1sKso,817
  content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
  content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
  content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
  content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
- content_core-0.5.1.dist-info/METADATA,sha256=mkvdVcLsiBDGiobgswCVQF8Xkceq5VpIRZspniB61PY,10533
- content_core-0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- content_core-0.5.1.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
- content_core-0.5.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
- content_core-0.5.1.dist-info/RECORD,,
+ content_core-0.6.0.dist-info/METADATA,sha256=pn72ciBGpWE7tVvJ2j3NmQPmFB60cNrkHBmp5ziuyqk,10534
+ content_core-0.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ content_core-0.6.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
+ content_core-0.6.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+ content_core-0.6.0.dist-info/RECORD,,

content_core/notebooks/docling.ipynb (deleted)
@@ -1,27 +0,0 @@
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from docling.document_converter import DocumentConverter\n",
- "\n",
- "\n",
- "source = \"/Users/luisnovo/dev/projetos/content-core/tests/input_content/file.docx\"\n",
- "source_url = \"https://arxiv.org/pdf/2408.09869\" # PDF path or URL\n",
- "converter = DocumentConverter()\n",
- "result = converter.convert(source)\n",
- "print(result.document.export_to_markdown())"
- ]
- }
- ],
- "metadata": {
- "language_info": {
- "name": "python"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }

content_core/prompts/content/cleanup.jinja (deleted)
@@ -1,16 +0,0 @@
- # GOAL
-
- Adjust the content below to make it clean and readable:
- Remove repeated strings that do not add value to the text.
-
- Remove any content unrelated to the text itself (e.g., metadata, artifacts, or extraction errors).
-
- Format the output as unstructured but clear text.
-
- Do not add extra text, introductions, conclusions, or commentary—only rewrite the provided content as it is.
-
- Do not interpret, analyze, or alter the meaning, intent, or narrative of the text—just reformat it for clarity and readability.
-
- Do not change the text structure, do not write conclusions about it. Your only job is to make it readable.
-
- Keep the text in its original language, regardless of what it is.

content_core/prompts/content/summarize.jinja (deleted)
@@ -1,25 +0,0 @@
- You are an AI assistant for a personal study platform.
-
- In this platform, your user collects various articles and content from the Internet for reference and study.
-
- Your role is to summarize the selected content as densely as possible, helping the reader extract maximum value from it without reading the full text.
- Focus solely on the content's value, avoiding unnecessary comments or messages.
-
- The summary should be dense, rich in characters, and designed to create a powerful vector representation.
- If the user provided additional context, follow its instructions. Otherwise, summary the whole content.
-
- Do not return any acknowledgments or greetings—only the summary.
-
- CONTENT:
-
- {{ content }}
-
- {% if context %}
- CONTEXT:
-
- User has provided the aditional context for your task:
- {{context}}
- {% endif%}
-
-
- SUMMARY: