content-core 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core/__init__.py +6 -2
- content_core/content/extraction/graph.py +23 -13
- content_core/notebooks/run.ipynb +34 -42
- content_core/processors/audio.py +83 -41
- content_core/templated_message.py +16 -24
- {content_core-0.5.0.dist-info → content_core-0.6.0.dist-info}/METADATA +3 -2
- {content_core-0.5.0.dist-info → content_core-0.6.0.dist-info}/RECORD +10 -14
- content_core/notebooks/docling.ipynb +0 -27
- content_core/prompter.py +0 -159
- content_core/prompts/content/cleanup.jinja +0 -16
- content_core/prompts/content/summarize.jinja +0 -25
- {content_core-0.5.0.dist-info → content_core-0.6.0.dist-info}/WHEEL +0 -0
- {content_core-0.5.0.dist-info → content_core-0.6.0.dist-info}/entry_points.txt +0 -0
- {content_core-0.5.0.dist-info → content_core-0.6.0.dist-info}/licenses/LICENSE +0 -0
content_core/processors/audio.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import os
|
|
3
|
+
import tempfile
|
|
4
|
+
import math
|
|
5
|
+
import traceback
|
|
3
6
|
from functools import partial
|
|
4
|
-
from
|
|
5
|
-
|
|
6
|
-
from pydub import AudioSegment
|
|
7
|
+
from moviepy import AudioFileClip
|
|
7
8
|
|
|
8
9
|
from content_core.common import ProcessSourceState
|
|
9
10
|
from content_core.logging import logger
|
|
10
|
-
from content_core.models import ModelFactory
|
|
11
11
|
|
|
12
12
|
# todo: remove reference to model_manager
|
|
13
13
|
# future: parallelize the transcription process
|
|
@@ -29,31 +29,29 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
|
|
|
29
29
|
output_prefix = os.path.splitext(os.path.basename(input_file_abs))[0]
|
|
30
30
|
|
|
31
31
|
# Load the audio file
|
|
32
|
-
audio =
|
|
32
|
+
audio = AudioFileClip(input_file_abs)
|
|
33
33
|
|
|
34
|
-
# Calculate segment length in
|
|
35
|
-
|
|
34
|
+
# Calculate segment length in seconds
|
|
35
|
+
segment_length_s = segment_length_minutes * 60
|
|
36
36
|
|
|
37
37
|
# Calculate number of segments
|
|
38
|
-
total_segments = ceil(
|
|
38
|
+
total_segments = math.ceil(audio.duration / segment_length_s)
|
|
39
39
|
logger.debug(f"Splitting file: {input_file_abs} into {total_segments} segments")
|
|
40
40
|
|
|
41
41
|
output_files = []
|
|
42
42
|
|
|
43
43
|
# Split the audio into segments
|
|
44
44
|
for i in range(total_segments):
|
|
45
|
-
start_time = i *
|
|
46
|
-
end_time = min((i + 1) *
|
|
45
|
+
start_time = i * segment_length_s
|
|
46
|
+
end_time = min((i + 1) * segment_length_s, audio.duration)
|
|
47
47
|
|
|
48
48
|
# Extract segment
|
|
49
|
-
segment = audio[start_time:end_time]
|
|
50
|
-
|
|
51
|
-
# Generate output filename
|
|
52
49
|
output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
|
|
53
50
|
output_path = os.path.join(output_dir, output_filename)
|
|
54
51
|
|
|
55
52
|
# Export segment
|
|
56
|
-
|
|
53
|
+
extract_audio(input_file_abs, output_path, start_time, end_time)
|
|
54
|
+
|
|
57
55
|
output_files.append(output_path)
|
|
58
56
|
|
|
59
57
|
logger.debug(f"Exported segment {i+1}/{total_segments}: {output_filename}")
|
|
@@ -66,43 +64,87 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
|
|
|
66
64
|
)
|
|
67
65
|
|
|
68
66
|
|
|
67
|
+
def extract_audio(input_file: str, output_file: str, start_time: float = None, end_time: float = None) -> None:
|
|
68
|
+
"""
|
|
69
|
+
Extract audio from a video or audio file and save it as an MP3 file.
|
|
70
|
+
If start_time and end_time are provided, only that segment of audio is extracted.
|
|
71
|
+
|
|
72
|
+
Args:
|
|
73
|
+
input_file (str): Path to the input video or audio file.
|
|
74
|
+
output_file (str): Path where the output MP3 file will be saved.
|
|
75
|
+
start_time (float, optional): Start time of the audio segment in seconds. Defaults to None.
|
|
76
|
+
end_time (float, optional): End time of the audio segment in seconds. Defaults to None.
|
|
77
|
+
"""
|
|
78
|
+
try:
|
|
79
|
+
# Load the file as an AudioFileClip
|
|
80
|
+
audio_clip = AudioFileClip(input_file)
|
|
81
|
+
|
|
82
|
+
# If start_time and end_time are provided, trim the audio
|
|
83
|
+
if start_time is not None and end_time is not None:
|
|
84
|
+
audio_clip = audio_clip.cutout(0, start_time).cutout(end_time - start_time, audio_clip.duration)
|
|
85
|
+
elif start_time is not None:
|
|
86
|
+
audio_clip = audio_clip.cutout(0, start_time)
|
|
87
|
+
elif end_time is not None:
|
|
88
|
+
audio_clip = audio_clip.cutout(end_time, audio_clip.duration)
|
|
89
|
+
|
|
90
|
+
# Export the audio as MP3
|
|
91
|
+
audio_clip.write_audiofile(output_file, codec='mp3')
|
|
92
|
+
audio_clip.close()
|
|
93
|
+
except Exception as e:
|
|
94
|
+
logger.error(f"Error extracting audio: {str(e)}")
|
|
95
|
+
raise
|
|
96
|
+
|
|
97
|
+
|
|
69
98
|
async def transcribe_audio_segment(audio_file, model):
|
|
70
99
|
"""Transcribe a single audio segment asynchronously"""
|
|
71
100
|
return (await model.atranscribe(audio_file)).text
|
|
72
101
|
|
|
73
102
|
|
|
74
|
-
async def
|
|
103
|
+
async def extract_audio_data(data: ProcessSourceState):
|
|
75
104
|
input_audio_path = data.file_path
|
|
76
|
-
audio_files = []
|
|
77
105
|
|
|
78
106
|
try:
|
|
79
|
-
#
|
|
80
|
-
|
|
107
|
+
# Create a temporary directory for audio segments
|
|
108
|
+
temp_dir = tempfile.mkdtemp()
|
|
109
|
+
output_prefix = os.path.splitext(os.path.basename(input_audio_path))[0]
|
|
110
|
+
output_dir = temp_dir
|
|
111
|
+
os.makedirs(output_dir, exist_ok=True)
|
|
81
112
|
|
|
82
|
-
#
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
113
|
+
# Split audio into segments if longer than 10 minutes
|
|
114
|
+
audio = AudioFileClip(input_audio_path)
|
|
115
|
+
duration_s = audio.duration
|
|
116
|
+
segment_length_s = 10 * 60 # 10 minutes in seconds
|
|
117
|
+
output_files = []
|
|
118
|
+
|
|
119
|
+
if duration_s > segment_length_s:
|
|
120
|
+
logger.info(f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments")
|
|
121
|
+
for i in range(math.ceil(duration_s / segment_length_s)):
|
|
122
|
+
start_time = i * segment_length_s
|
|
123
|
+
end_time = min((i + 1) * segment_length_s, audio.duration)
|
|
89
124
|
|
|
90
|
-
|
|
125
|
+
# Extract segment
|
|
126
|
+
output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
|
|
127
|
+
output_path = os.path.join(output_dir, output_filename)
|
|
91
128
|
|
|
129
|
+
extract_audio(input_audio_path, output_path, start_time, end_time)
|
|
130
|
+
|
|
131
|
+
output_files.append(output_path)
|
|
132
|
+
else:
|
|
133
|
+
output_files = [input_audio_path]
|
|
134
|
+
|
|
135
|
+
# Transcribe audio files
|
|
136
|
+
from content_core.models import ModelFactory
|
|
137
|
+
speech_to_text_model = ModelFactory.get_model("speech_to_text")
|
|
138
|
+
transcriptions = []
|
|
139
|
+
for audio_file in output_files:
|
|
140
|
+
transcription = await transcribe_audio_segment(audio_file, speech_to_text_model)
|
|
141
|
+
transcriptions.append(transcription)
|
|
142
|
+
|
|
143
|
+
return {
|
|
144
|
+
"metadata": {"audio_files": output_files},
|
|
145
|
+
"content": " ".join(transcriptions)
|
|
146
|
+
}
|
|
92
147
|
except Exception as e:
|
|
93
|
-
logger.error(f"Error
|
|
94
|
-
logger.
|
|
148
|
+
logger.error(f"Error processing audio: {str(e)}")
|
|
149
|
+
logger.error(traceback.format_exc())
|
|
95
150
|
raise
|
|
96
|
-
|
|
97
|
-
finally:
|
|
98
|
-
# Clean up temporary files
|
|
99
|
-
def _cleanup(files):
|
|
100
|
-
for file in files:
|
|
101
|
-
try:
|
|
102
|
-
os.remove(file)
|
|
103
|
-
except OSError as e:
|
|
104
|
-
logger.error(f"Error removing temporary file {file}: {str(e)}")
|
|
105
|
-
|
|
106
|
-
await asyncio.get_event_loop().run_in_executor(
|
|
107
|
-
None, partial(_cleanup, audio_files)
|
|
108
|
-
)
|
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
from typing import Dict, Optional, Union
|
|
2
2
|
|
|
3
|
+
from ai_prompter import Prompter
|
|
3
4
|
from esperanto import LanguageModel
|
|
4
5
|
from esperanto.common_types import Message
|
|
5
6
|
from pydantic import BaseModel, Field
|
|
6
7
|
|
|
7
8
|
from content_core.models import ModelFactory
|
|
8
|
-
from content_core.prompter import Prompter
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class TemplatedMessageInput(BaseModel):
|
|
12
|
-
system_prompt_template: Optional[str] =
|
|
13
|
-
system_prompt_text: Optional[str] =
|
|
14
|
-
user_prompt_template: Optional[str] =
|
|
15
|
-
user_prompt_text: Optional[str] =
|
|
12
|
+
system_prompt_template: Optional[str] = None
|
|
13
|
+
system_prompt_text: Optional[str] = None
|
|
14
|
+
user_prompt_template: Optional[str] = None
|
|
15
|
+
user_prompt_text: Optional[str] = None
|
|
16
16
|
data: Optional[Union[Dict, BaseModel]] = Field(default_factory=lambda: {})
|
|
17
17
|
config: Dict = Field(
|
|
18
18
|
description="The config for the LLM",
|
|
@@ -28,30 +28,22 @@ async def templated_message(
|
|
|
28
28
|
input: TemplatedMessageInput, model: Optional[LanguageModel] = None
|
|
29
29
|
) -> str:
|
|
30
30
|
if not model:
|
|
31
|
-
model = ModelFactory.get_model(
|
|
31
|
+
model = ModelFactory.get_model("default_model")
|
|
32
32
|
|
|
33
33
|
msgs = []
|
|
34
34
|
if input.system_prompt_template or input.system_prompt_text:
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
prompt_text=input.system_prompt_text,
|
|
41
|
-
).render(data=input.data),
|
|
42
|
-
)
|
|
43
|
-
)
|
|
35
|
+
system_prompt = Prompter(
|
|
36
|
+
prompt_template=input.system_prompt_template,
|
|
37
|
+
template_text=input.system_prompt_text,
|
|
38
|
+
).render(data=input.data)
|
|
39
|
+
msgs.append(Message(role="system", content=system_prompt))
|
|
44
40
|
|
|
45
41
|
if input.user_prompt_template or input.user_prompt_text:
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
prompt_text=input.user_prompt_text,
|
|
52
|
-
).render(data=input.data),
|
|
53
|
-
)
|
|
54
|
-
)
|
|
42
|
+
user_prompt = Prompter(
|
|
43
|
+
prompt_template=input.user_prompt_template,
|
|
44
|
+
template_text=input.user_prompt_text,
|
|
45
|
+
).render(data=input.data)
|
|
46
|
+
msgs.append(Message(role="user", content=user_prompt))
|
|
55
47
|
|
|
56
48
|
result = await model.achat_complete(msgs)
|
|
57
49
|
return result.content
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Extract what matters from any media source
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
7
7
|
Requires-Python: >=3.10
|
|
8
|
+
Requires-Dist: ai-prompter>=0.2.3
|
|
8
9
|
Requires-Dist: aiohttp>=3.11
|
|
9
10
|
Requires-Dist: bs4>=0.0.2
|
|
10
11
|
Requires-Dist: dicttoxml>=1.7.16
|
|
@@ -14,10 +15,10 @@ Requires-Dist: jinja2>=3.1.6
|
|
|
14
15
|
Requires-Dist: langdetect>=1.0.9
|
|
15
16
|
Requires-Dist: langgraph>=0.3.29
|
|
16
17
|
Requires-Dist: loguru>=0.7.3
|
|
18
|
+
Requires-Dist: moviepy>=2.1.2
|
|
17
19
|
Requires-Dist: openai>=1.73.0
|
|
18
20
|
Requires-Dist: openpyxl>=3.1.5
|
|
19
21
|
Requires-Dist: pandas>=2.2.3
|
|
20
|
-
Requires-Dist: pydub>=0.25.1
|
|
21
22
|
Requires-Dist: pymupdf>=1.25.5
|
|
22
23
|
Requires-Dist: python-docx>=1.1.2
|
|
23
24
|
Requires-Dist: python-dotenv>=1.1.0
|
|
@@ -1,12 +1,11 @@
|
|
|
1
|
-
content_core/__init__.py,sha256=
|
|
1
|
+
content_core/__init__.py,sha256=ANKeslNXOGumwrkjqgRik23e5PdGps2C0FSup8_XH2Y,6515
|
|
2
2
|
content_core/cc_config.yaml,sha256=w66fo5ut6TPaU3o4hkjnroqg2hkr8YuOG3BRtI50j1s,701
|
|
3
3
|
content_core/config.py,sha256=-aUsTB6Z3fa_XIWdHNXhMgWkVLWjEW1kfyQXXB_-j54,1632
|
|
4
4
|
content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
|
|
5
5
|
content_core/models.py,sha256=FBV_tV6cmI0F82WfcA6xHag-YMsxI1dIbDGWG-3Eq_Y,935
|
|
6
6
|
content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
|
|
7
|
-
content_core/prompter.py,sha256=-ShuSyHvK50xlgsAFfA9AnAJV-LlzWwmbPDq2wUZRcI,5793
|
|
8
7
|
content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
|
|
9
|
-
content_core/templated_message.py,sha256=
|
|
8
|
+
content_core/templated_message.py,sha256=KbI2rcvgGM5oRIcsG68zAZfgNsC97fR16D61683ZSnY,1617
|
|
10
9
|
content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
|
|
11
10
|
content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
|
|
12
11
|
content_core/common/state.py,sha256=cJvIwqvrvGxuk1t51bTOvPV-RM5Nbd8F8C4o0dawIXo,1185
|
|
@@ -15,12 +14,11 @@ content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCr
|
|
|
15
14
|
content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
|
|
16
15
|
content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
|
|
17
16
|
content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
|
|
18
|
-
content_core/content/extraction/graph.py,sha256=
|
|
17
|
+
content_core/content/extraction/graph.py,sha256=d5Hp7GS2dFpYQIHFTIFhU-7ySZ3lfipdDxZZpe2DXS8,6361
|
|
19
18
|
content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
|
|
20
19
|
content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
|
|
21
|
-
content_core/notebooks/
|
|
22
|
-
content_core/
|
|
23
|
-
content_core/processors/audio.py,sha256=jDn0_6F5dLcmz_C-iR80uOqOIAz49ELya2R5JeM15vo,3538
|
|
20
|
+
content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
|
|
21
|
+
content_core/processors/audio.py,sha256=KnwxK85X9qRyVziMhFd103kfHkE8qGB1D4yW5lYO90E,5701
|
|
24
22
|
content_core/processors/docling.py,sha256=wQ8ThAcyrCy-c95QtgplQ9UZtjCZTddLD9y1_CrRtSQ,2111
|
|
25
23
|
content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
|
|
26
24
|
content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
|
|
@@ -28,14 +26,12 @@ content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz
|
|
|
28
26
|
content_core/processors/url.py,sha256=yhAnvIlYKc13iZedwA0ck6h6wd2j6T-Q2NAtMen3hIs,6783
|
|
29
27
|
content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
|
|
30
28
|
content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
|
|
31
|
-
content_core/prompts/content/cleanup.jinja,sha256=elyjbm9O_AeOcxkG-kui5wjBIRiOQCicjm92I4NmoVA,693
|
|
32
|
-
content_core/prompts/content/summarize.jinja,sha256=zLPbomfjA-tQZr-c_rOqvKhd55R8NN3Q2gLyLR1sKso,817
|
|
33
29
|
content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
|
|
34
30
|
content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
|
|
35
31
|
content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
|
|
36
32
|
content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
|
|
37
|
-
content_core-0.
|
|
38
|
-
content_core-0.
|
|
39
|
-
content_core-0.
|
|
40
|
-
content_core-0.
|
|
41
|
-
content_core-0.
|
|
33
|
+
content_core-0.6.0.dist-info/METADATA,sha256=pn72ciBGpWE7tVvJ2j3NmQPmFB60cNrkHBmp5ziuyqk,10534
|
|
34
|
+
content_core-0.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
35
|
+
content_core-0.6.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
|
|
36
|
+
content_core-0.6.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
|
|
37
|
+
content_core-0.6.0.dist-info/RECORD,,
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"cells": [
|
|
3
|
-
{
|
|
4
|
-
"cell_type": "code",
|
|
5
|
-
"execution_count": null,
|
|
6
|
-
"metadata": {},
|
|
7
|
-
"outputs": [],
|
|
8
|
-
"source": [
|
|
9
|
-
"from docling.document_converter import DocumentConverter\n",
|
|
10
|
-
"\n",
|
|
11
|
-
"\n",
|
|
12
|
-
"source = \"/Users/luisnovo/dev/projetos/content-core/tests/input_content/file.docx\"\n",
|
|
13
|
-
"source_url = \"https://arxiv.org/pdf/2408.09869\" # PDF path or URL\n",
|
|
14
|
-
"converter = DocumentConverter()\n",
|
|
15
|
-
"result = converter.convert(source)\n",
|
|
16
|
-
"print(result.document.export_to_markdown())"
|
|
17
|
-
]
|
|
18
|
-
}
|
|
19
|
-
],
|
|
20
|
-
"metadata": {
|
|
21
|
-
"language_info": {
|
|
22
|
-
"name": "python"
|
|
23
|
-
}
|
|
24
|
-
},
|
|
25
|
-
"nbformat": 4,
|
|
26
|
-
"nbformat_minor": 2
|
|
27
|
-
}
|
content_core/prompter.py
DELETED
|
@@ -1,159 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
A prompt management module using Jinja to generate complex prompts with simple templates.
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
import os
|
|
6
|
-
from dataclasses import dataclass
|
|
7
|
-
from datetime import datetime
|
|
8
|
-
from typing import Any, Dict, Optional, Union
|
|
9
|
-
|
|
10
|
-
from dotenv import load_dotenv
|
|
11
|
-
from jinja2 import Environment, FileSystemLoader, Template
|
|
12
|
-
from langchain_core.prompts import ChatPromptTemplate
|
|
13
|
-
from pydantic import BaseModel
|
|
14
|
-
|
|
15
|
-
from content_core.logging import logger
|
|
16
|
-
|
|
17
|
-
load_dotenv()
|
|
18
|
-
|
|
19
|
-
prompt_path_default = os.path.join(
|
|
20
|
-
os.path.dirname(os.path.abspath(__file__)), "prompts"
|
|
21
|
-
)
|
|
22
|
-
prompt_path_custom = os.getenv("PROMPT_PATH")
|
|
23
|
-
|
|
24
|
-
logger.debug(
|
|
25
|
-
f"Pasta de prompts personalizada: {prompt_path_custom if prompt_path_custom else 'Não definida'}"
|
|
26
|
-
)
|
|
27
|
-
logger.debug(f"Pasta de prompts padrão: {prompt_path_default}")
|
|
28
|
-
|
|
29
|
-
env_custom = (
|
|
30
|
-
Environment(loader=FileSystemLoader(prompt_path_custom))
|
|
31
|
-
if prompt_path_custom and os.path.exists(prompt_path_custom)
|
|
32
|
-
else None
|
|
33
|
-
)
|
|
34
|
-
env_default = Environment(loader=FileSystemLoader(prompt_path_default))
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
@dataclass
|
|
38
|
-
class Prompter:
|
|
39
|
-
"""
|
|
40
|
-
A class for managing and rendering prompt templates.
|
|
41
|
-
|
|
42
|
-
Attributes:
|
|
43
|
-
prompt_template (str, optional): The name of the prompt template file.
|
|
44
|
-
prompt_variation (str, optional): The variation of the prompt template.
|
|
45
|
-
prompt_text (str, optional): The raw prompt text.
|
|
46
|
-
template (Union[str, Template], optional): The Jinja2 template object.
|
|
47
|
-
"""
|
|
48
|
-
|
|
49
|
-
prompt_template: Optional[str] = None
|
|
50
|
-
prompt_variation: Optional[str] = "default"
|
|
51
|
-
prompt_text: Optional[str] = None
|
|
52
|
-
template: Optional[Union[str, Template]] = None
|
|
53
|
-
parser: Optional[Any] = None
|
|
54
|
-
|
|
55
|
-
def __init__(self, prompt_template=None, prompt_text=None, parser=None):
|
|
56
|
-
"""
|
|
57
|
-
Initialize the Prompter with either a template file or raw text.
|
|
58
|
-
|
|
59
|
-
Args:
|
|
60
|
-
prompt_template (str, optional): The name of the prompt template file.
|
|
61
|
-
prompt_text (str, optional): The raw prompt text.
|
|
62
|
-
"""
|
|
63
|
-
self.prompt_template = prompt_template
|
|
64
|
-
self.prompt_text = prompt_text
|
|
65
|
-
self.parser = parser
|
|
66
|
-
self.setup()
|
|
67
|
-
|
|
68
|
-
def setup(self):
|
|
69
|
-
"""
|
|
70
|
-
Set up the Jinja2 template based on the provided template file or text.
|
|
71
|
-
Raises:
|
|
72
|
-
ValueError: If neither prompt_template nor prompt_text is provided.
|
|
73
|
-
"""
|
|
74
|
-
if self.prompt_template:
|
|
75
|
-
# Primeiro tenta carregar da pasta personalizada, se disponível
|
|
76
|
-
if env_custom:
|
|
77
|
-
try:
|
|
78
|
-
self.template = env_custom.get_template(
|
|
79
|
-
f"{self.prompt_template}.jinja"
|
|
80
|
-
)
|
|
81
|
-
logger.debug(
|
|
82
|
-
f"Template {self.prompt_template} carregado da pasta personalizada"
|
|
83
|
-
)
|
|
84
|
-
return
|
|
85
|
-
except Exception as e:
|
|
86
|
-
logger.debug(
|
|
87
|
-
f"Template {self.prompt_template} não encontrado na pasta personalizada: {e}"
|
|
88
|
-
)
|
|
89
|
-
|
|
90
|
-
# Se não encontrou na personalizada ou não há pasta personalizada, tenta a padrão
|
|
91
|
-
try:
|
|
92
|
-
self.template = env_default.get_template(
|
|
93
|
-
f"{self.prompt_template}.jinja"
|
|
94
|
-
)
|
|
95
|
-
logger.debug(
|
|
96
|
-
f"Template {self.prompt_template} carregado da pasta padrão"
|
|
97
|
-
)
|
|
98
|
-
except Exception as e:
|
|
99
|
-
raise ValueError(
|
|
100
|
-
f"Template {self.prompt_template} não encontrado na pasta padrão: {e}"
|
|
101
|
-
)
|
|
102
|
-
elif self.prompt_text:
|
|
103
|
-
self.template = Template(self.prompt_text)
|
|
104
|
-
else:
|
|
105
|
-
raise ValueError("Prompter must have a prompt_template or prompt_text")
|
|
106
|
-
|
|
107
|
-
assert self.prompt_template or self.prompt_text, "Prompt is required"
|
|
108
|
-
|
|
109
|
-
def to_langchain(self):
|
|
110
|
-
if isinstance(self.template, str):
|
|
111
|
-
template_text = self.template
|
|
112
|
-
else:
|
|
113
|
-
# For file-based templates, read the raw content
|
|
114
|
-
template_path = os.path.join("prompts", f"{self.prompt_template}.jinja")
|
|
115
|
-
with open(template_path, "r") as f:
|
|
116
|
-
template_text = f.read()
|
|
117
|
-
return ChatPromptTemplate.from_template(template_text, template_format="jinja2")
|
|
118
|
-
|
|
119
|
-
@classmethod
|
|
120
|
-
def from_text(cls, text: str):
|
|
121
|
-
"""
|
|
122
|
-
Create a Prompter instance from raw text, which can contain Jinja code.
|
|
123
|
-
|
|
124
|
-
Args:
|
|
125
|
-
text (str): The raw prompt text.
|
|
126
|
-
|
|
127
|
-
Returns:
|
|
128
|
-
Prompter: A new Prompter instance.
|
|
129
|
-
"""
|
|
130
|
-
|
|
131
|
-
return cls(prompt_text=text)
|
|
132
|
-
|
|
133
|
-
def render(self, data: Optional[Union[Dict, BaseModel]] = {}) -> str:
|
|
134
|
-
"""
|
|
135
|
-
Render the prompt template with the given data.
|
|
136
|
-
|
|
137
|
-
Args:
|
|
138
|
-
data (Union[Dict, BaseModel]): The data to be used in rendering the template.
|
|
139
|
-
Can be either a dictionary or a Pydantic BaseModel.
|
|
140
|
-
|
|
141
|
-
Returns:
|
|
142
|
-
str: The rendered prompt text.
|
|
143
|
-
|
|
144
|
-
Raises:
|
|
145
|
-
AssertionError: If the template is not defined or not a Jinja2 Template.
|
|
146
|
-
"""
|
|
147
|
-
# Convert Pydantic model to dict if necessary
|
|
148
|
-
data_dict = data.model_dump() if isinstance(data, BaseModel) else data
|
|
149
|
-
# Create a new mutable dictionary with the original data
|
|
150
|
-
render_data = dict(data_dict)
|
|
151
|
-
render_data["current_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
|
152
|
-
if self.parser:
|
|
153
|
-
render_data["format_instructions"] = self.parser.get_format_instructions()
|
|
154
|
-
assert self.template, "Prompter template is not defined"
|
|
155
|
-
assert isinstance(
|
|
156
|
-
self.template, Template
|
|
157
|
-
), "Prompter template is not a Jinja2 Template"
|
|
158
|
-
return self.template.render(render_data)
|
|
159
|
-
return self.template.render(render_data)
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
# GOAL
|
|
2
|
-
|
|
3
|
-
Adjust the content below to make it clean and readable:
|
|
4
|
-
Remove repeated strings that do not add value to the text.
|
|
5
|
-
|
|
6
|
-
Remove any content unrelated to the text itself (e.g., metadata, artifacts, or extraction errors).
|
|
7
|
-
|
|
8
|
-
Format the output as unstructured but clear text.
|
|
9
|
-
|
|
10
|
-
Do not add extra text, introductions, conclusions, or commentary—only rewrite the provided content as it is.
|
|
11
|
-
|
|
12
|
-
Do not interpret, analyze, or alter the meaning, intent, or narrative of the text—just reformat it for clarity and readability.
|
|
13
|
-
|
|
14
|
-
Do not change the text structure, do not write conclusions about it. Your only job is to make it readable.
|
|
15
|
-
|
|
16
|
-
Keep the text in its original language, regardless of what it is.
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
You are an AI assistant for a personal study platform.
|
|
2
|
-
|
|
3
|
-
In this platform, your user collects various articles and content from the Internet for reference and study.
|
|
4
|
-
|
|
5
|
-
Your role is to summarize the selected content as densely as possible, helping the reader extract maximum value from it without reading the full text.
|
|
6
|
-
Focus solely on the content's value, avoiding unnecessary comments or messages.
|
|
7
|
-
|
|
8
|
-
The summary should be dense, rich in characters, and designed to create a powerful vector representation.
|
|
9
|
-
If the user provided additional context, follow its instructions. Otherwise, summary the whole content.
|
|
10
|
-
|
|
11
|
-
Do not return any acknowledgments or greetings—only the summary.
|
|
12
|
-
|
|
13
|
-
CONTENT:
|
|
14
|
-
|
|
15
|
-
{{ content }}
|
|
16
|
-
|
|
17
|
-
{% if context %}
|
|
18
|
-
CONTEXT:
|
|
19
|
-
|
|
20
|
-
User has provided the aditional context for your task:
|
|
21
|
-
{{context}}
|
|
22
|
-
{% endif%}
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
SUMMARY:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|