npcpy 1.2.33__py3-none-any.whl → 1.2.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- npcpy/data/audio.py +35 -1
- npcpy/data/load.py +149 -7
- npcpy/data/video.py +72 -0
- npcpy/ft/diff.py +332 -71
- npcpy/gen/image_gen.py +120 -23
- npcpy/gen/ocr.py +187 -0
- npcpy/memory/command_history.py +257 -41
- npcpy/npc_compiler.py +102 -157
- npcpy/serve.py +1469 -739
- {npcpy-1.2.33.dist-info → npcpy-1.2.35.dist-info}/METADATA +1 -1
- {npcpy-1.2.33.dist-info → npcpy-1.2.35.dist-info}/RECORD +14 -13
- {npcpy-1.2.33.dist-info → npcpy-1.2.35.dist-info}/WHEEL +0 -0
- {npcpy-1.2.33.dist-info → npcpy-1.2.35.dist-info}/licenses/LICENSE +0 -0
- {npcpy-1.2.33.dist-info → npcpy-1.2.35.dist-info}/top_level.txt +0 -0
npcpy/data/audio.py
CHANGED
|
@@ -175,6 +175,41 @@ def run_transcription(audio_np):
|
|
|
175
175
|
return None
|
|
176
176
|
|
|
177
177
|
|
|
178
|
+
def transcribe_audio_file(file_path: str, language=None) -> str:
    """
    Transcribe an audio file with whichever local whisper backend is present.

    Tries faster-whisper first (matching the rest of this module), then the
    reference openai/whisper package. Returns an empty string when neither
    backend is available or neither produced any text.
    """
    result_text = ""

    # Preferred backend: faster-whisper, on CUDA when torch reports a GPU.
    try:
        from faster_whisper import WhisperModel  # type: ignore
        try:
            import torch  # type: ignore
            run_device = "cuda" if torch.cuda.is_available() else "cpu"
        except Exception:
            run_device = "cpu"
        engine = WhisperModel("small", device=run_device)
        segments, _ = engine.transcribe(file_path, language=language, beam_size=5)
        result_text = " ".join(s.text.strip() for s in segments if s.text).strip()
    except Exception:
        result_text = ""
    if result_text:
        return result_text

    # Fallback backend: openai/whisper, if installed.
    try:
        import whisper  # type: ignore
        engine = whisper.load_model("small")
        payload = engine.transcribe(file_path, language=language)
        result_text = payload.get("text", "").strip()
    except Exception:
        result_text = ""
    if result_text:
        return result_text

    return ""
|
|
211
|
+
|
|
212
|
+
|
|
178
213
|
|
|
179
214
|
def load_history():
|
|
180
215
|
global history
|
|
@@ -431,4 +466,3 @@ def process_text_for_tts(text):
|
|
|
431
466
|
text = re.sub(r"([.!?])(\w)", r"\1 \2", text)
|
|
432
467
|
return text
|
|
433
468
|
|
|
434
|
-
|
npcpy/data/load.py
CHANGED
|
@@ -4,8 +4,10 @@ import json
|
|
|
4
4
|
import io
|
|
5
5
|
from PIL import Image
|
|
6
6
|
import numpy as np
|
|
7
|
-
from typing import Optional
|
|
7
|
+
from typing import Optional, List
|
|
8
8
|
import os
|
|
9
|
+
import tempfile
|
|
10
|
+
import subprocess
|
|
9
11
|
|
|
10
12
|
try:
|
|
11
13
|
from docx import Document
|
|
@@ -90,12 +92,17 @@ extension_map = {
|
|
|
90
92
|
"JPEG": "images",
|
|
91
93
|
"GIF": "images",
|
|
92
94
|
"SVG": "images",
|
|
95
|
+
"WEBP": "images",
|
|
96
|
+
"BMP": "images",
|
|
97
|
+
"TIFF": "images",
|
|
93
98
|
"MP4": "videos",
|
|
94
99
|
"AVI": "videos",
|
|
95
100
|
"MOV": "videos",
|
|
96
101
|
"WMV": "videos",
|
|
97
102
|
"MPG": "videos",
|
|
98
103
|
"MPEG": "videos",
|
|
104
|
+
"WEBM": "videos",
|
|
105
|
+
"MKV": "videos",
|
|
99
106
|
"DOCX": "documents",
|
|
100
107
|
"PPTX": "documents",
|
|
101
108
|
"PDF": "documents",
|
|
@@ -105,6 +112,12 @@ extension_map = {
|
|
|
105
112
|
"MD": "documents",
|
|
106
113
|
"HTML": "documents",
|
|
107
114
|
"HTM": "documents",
|
|
115
|
+
"MP3": "audio",
|
|
116
|
+
"WAV": "audio",
|
|
117
|
+
"M4A": "audio",
|
|
118
|
+
"AAC": "audio",
|
|
119
|
+
"FLAC": "audio",
|
|
120
|
+
"OGG": "audio",
|
|
108
121
|
"ZIP": "archives",
|
|
109
122
|
"RAR": "archives",
|
|
110
123
|
"7Z": "archives",
|
|
@@ -112,6 +125,136 @@ extension_map = {
|
|
|
112
125
|
"GZ": "archives",
|
|
113
126
|
}
|
|
114
127
|
|
|
128
|
+
def _chunk_text(full_content: str, chunk_size: int) -> List[str]:
|
|
129
|
+
"""Split long content into reasonably sized chunks for model input."""
|
|
130
|
+
chunks = []
|
|
131
|
+
for i in range(0, len(full_content), chunk_size):
|
|
132
|
+
chunk = full_content[i:i+chunk_size].strip()
|
|
133
|
+
if chunk:
|
|
134
|
+
chunks.append(chunk)
|
|
135
|
+
return chunks
|
|
136
|
+
|
|
137
|
+
def _transcribe_audio(file_path: str, language: Optional[str] = None) -> str:
|
|
138
|
+
"""
|
|
139
|
+
Best-effort audio transcription using optional dependencies.
|
|
140
|
+
Tries faster-whisper, then openai/whisper. Falls back to metadata only.
|
|
141
|
+
"""
|
|
142
|
+
# Prefer the existing audio module helper if present
|
|
143
|
+
try:
|
|
144
|
+
from npcpy.data.audio import transcribe_audio_file # type: ignore
|
|
145
|
+
text = transcribe_audio_file(file_path, language=language)
|
|
146
|
+
if text:
|
|
147
|
+
return text
|
|
148
|
+
except Exception:
|
|
149
|
+
pass
|
|
150
|
+
|
|
151
|
+
# Try faster-whisper first
|
|
152
|
+
try:
|
|
153
|
+
from faster_whisper import WhisperModel
|
|
154
|
+
try:
|
|
155
|
+
import torch
|
|
156
|
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
157
|
+
except Exception:
|
|
158
|
+
device = "cpu"
|
|
159
|
+
model = WhisperModel("small", device=device)
|
|
160
|
+
segments, _ = model.transcribe(file_path, language=language, beam_size=5)
|
|
161
|
+
return " ".join(seg.text.strip() for seg in segments if seg.text).strip()
|
|
162
|
+
except Exception:
|
|
163
|
+
pass
|
|
164
|
+
|
|
165
|
+
# Fallback: openai/whisper
|
|
166
|
+
try:
|
|
167
|
+
import whisper
|
|
168
|
+
model = whisper.load_model("small")
|
|
169
|
+
result = model.transcribe(file_path, language=language)
|
|
170
|
+
return result.get("text", "").strip()
|
|
171
|
+
except Exception:
|
|
172
|
+
pass
|
|
173
|
+
|
|
174
|
+
# Last resort metadata message
|
|
175
|
+
return f"[Audio file at {file_path}; install faster-whisper or whisper for transcription]"
|
|
176
|
+
|
|
177
|
+
def load_audio(file_path: str, language: Optional[str] = None) -> str:
    """Load an audio file and return its transcript as text.

    Falls back to a bracketed placeholder when transcription yields nothing.
    """
    text = _transcribe_audio(file_path, language=language)
    return text if text else f"[Audio file at {file_path}; no transcript available]"
|
|
183
|
+
|
|
184
|
+
def _extract_audio_from_video(file_path: str, max_duration: int = 600) -> Optional[str]:
|
|
185
|
+
"""
|
|
186
|
+
Use ffmpeg to dump the audio track from a video into a temp wav for transcription.
|
|
187
|
+
Returns the temp path or None.
|
|
188
|
+
"""
|
|
189
|
+
try:
|
|
190
|
+
temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
|
191
|
+
temp_audio.close()
|
|
192
|
+
cmd = [
|
|
193
|
+
"ffmpeg",
|
|
194
|
+
"-y",
|
|
195
|
+
"-i",
|
|
196
|
+
file_path,
|
|
197
|
+
"-vn",
|
|
198
|
+
"-ac",
|
|
199
|
+
"1",
|
|
200
|
+
"-ar",
|
|
201
|
+
"16000",
|
|
202
|
+
"-t",
|
|
203
|
+
str(max_duration),
|
|
204
|
+
temp_audio.name,
|
|
205
|
+
]
|
|
206
|
+
subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
207
|
+
return temp_audio.name
|
|
208
|
+
except Exception:
|
|
209
|
+
return None
|
|
210
|
+
|
|
211
|
+
def load_video(file_path: str, language: Optional[str] = None, max_audio_seconds: int = 600) -> str:
    """
    Summarize a video by reporting metadata and transcribing its audio track.

    Prefers npcpy.data.video.summarize_video_file when importable; otherwise
    builds a minimal summary from cv2 metadata (when cv2 is available) plus a
    best-effort transcript of up to max_audio_seconds of audio.

    Args:
        file_path: Path to the video file.
        language: Optional language hint for the transcriber.
        max_audio_seconds: Cap on how much audio is extracted for transcription.
    """
    # Prefer the video module helper if present.
    try:
        from npcpy.data.video import summarize_video_file  # type: ignore
        return summarize_video_file(file_path, language=language, max_audio_seconds=max_audio_seconds)
    except Exception:
        pass

    # Fallback to minimal summary/transcription.
    meta_bits = []
    try:
        import cv2
        video = cv2.VideoCapture(file_path)
        fps = video.get(cv2.CAP_PROP_FPS)
        frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        duration = frame_count / fps if fps else 0
        meta_bits.append(
            f"Video file: {os.path.basename(file_path)} | {width}x{height} | {fps:.2f} fps | {frame_count} frames | ~{duration:.1f}s"
        )
        video.release()
    except Exception:
        meta_bits.append(f"Video file: {os.path.basename(file_path)}")

    audio_path = _extract_audio_from_video(file_path, max_duration=max_audio_seconds)
    transcript = ""
    if audio_path:
        try:
            transcript = _transcribe_audio(audio_path, language=language)
        finally:
            try:
                os.remove(audio_path)
            except Exception:
                pass

    # Fix: _transcribe_audio returns a bracketed "[Audio file at ...]"
    # placeholder when no backend is installed; the original mislabeled that
    # placeholder as an actual transcript.
    if transcript and not transcript.startswith("[Audio file at "):
        meta_bits.append("Audio transcript:")
        meta_bits.append(transcript)
    else:
        meta_bits.append("[No transcript extracted; ensure ffmpeg and faster-whisper/whisper are installed]")

    return "\n".join(meta_bits)
|
|
257
|
+
|
|
115
258
|
def load_file_contents(file_path, chunk_size=None):
|
|
116
259
|
file_ext = os.path.splitext(file_path)[1].upper().lstrip('.')
|
|
117
260
|
full_content = ""
|
|
@@ -137,18 +280,17 @@ def load_file_contents(file_path, chunk_size=None):
|
|
|
137
280
|
elif file_ext == 'JSON':
|
|
138
281
|
data = load_json(file_path)
|
|
139
282
|
full_content = json.dumps(data, indent=2)
|
|
283
|
+
elif file_ext in ['MP3', 'WAV', 'M4A', 'AAC', 'FLAC', 'OGG']:
|
|
284
|
+
full_content = load_audio(file_path)
|
|
285
|
+
elif file_ext in ['MP4', 'AVI', 'MOV', 'WMV', 'MPG', 'MPEG', 'WEBM', 'MKV']:
|
|
286
|
+
full_content = load_video(file_path)
|
|
140
287
|
else:
|
|
141
288
|
return [f"Unsupported file format for content loading: {file_ext}"]
|
|
142
289
|
|
|
143
290
|
if not full_content:
|
|
144
291
|
return []
|
|
145
292
|
|
|
146
|
-
|
|
147
|
-
for i in range(0, len(full_content), chunk_size):
|
|
148
|
-
chunk = full_content[i:i+chunk_size].strip()
|
|
149
|
-
if chunk:
|
|
150
|
-
chunks.append(chunk)
|
|
151
|
-
return chunks
|
|
293
|
+
return _chunk_text(full_content, chunk_size)
|
|
152
294
|
|
|
153
295
|
except Exception as e:
|
|
154
296
|
return [f"Error loading file {file_path}: {str(e)}"]
|
npcpy/data/video.py
CHANGED
|
@@ -1,4 +1,7 @@
|
|
|
1
1
|
|
|
2
|
+
import os
|
|
3
|
+
import tempfile
|
|
4
|
+
import subprocess
|
|
2
5
|
|
|
3
6
|
|
|
4
7
|
def process_video(file_path, table_name):
|
|
@@ -26,3 +29,72 @@ def process_video(file_path, table_name):
|
|
|
26
29
|
except Exception as e:
|
|
27
30
|
print(f"Error processing video: {e}")
|
|
28
31
|
return [], []
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def summarize_video_file(file_path: str, language: str = None, max_audio_seconds: int = 600) -> str:
    """
    Summarize a video using lightweight metadata plus an optional audio transcript.

    Prefers the audio transcription helper in npcpy.data.audio when available.

    Args:
        file_path: Path to the video file.
        language: Optional language hint passed to the transcriber.
        max_audio_seconds: Cap (seconds) on how much audio is extracted.

    Returns:
        A newline-joined summary: a metadata line, then either the transcript
        or a bracketed note explaining why none was extracted.
    """
    meta_bits = []
    try:
        import cv2  # type: ignore

        video = cv2.VideoCapture(file_path)
        fps = video.get(cv2.CAP_PROP_FPS)
        frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        duration = frame_count / fps if fps else 0
        meta_bits.append(
            f"Video file: {os.path.basename(file_path)} | {width}x{height} | {fps:.2f} fps | {frame_count} frames | ~{duration:.1f}s"
        )
        video.release()
    except Exception:
        meta_bits.append(f"Video file: {os.path.basename(file_path)}")

    # Extract audio track with ffmpeg if available.
    audio_path = None
    temp_name = None
    try:
        temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        temp_audio.close()
        temp_name = temp_audio.name
        cmd = [
            "ffmpeg",
            "-y",               # overwrite the (empty) temp file
            "-i", file_path,
            "-vn",              # audio only
            "-ac", "1",         # mono
            "-ar", "16000",     # 16 kHz for whisper-style models
            "-t", str(max_audio_seconds),
            temp_name,
        ]
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        audio_path = temp_name
    except Exception:
        # Fix: the original leaked the temp wav when ffmpeg failed, and could
        # hit a NameError here if NamedTemporaryFile itself raised.
        if temp_name:
            try:
                os.remove(temp_name)
            except OSError:
                pass
        audio_path = None

    transcript = ""
    if audio_path:
        try:
            try:
                from npcpy.data.audio import transcribe_audio_file
                transcript = transcribe_audio_file(audio_path, language=language)  # type: ignore
            except Exception:
                transcript = ""
        finally:
            try:
                os.remove(audio_path)
            except Exception:
                pass

    if transcript:
        meta_bits.append("Audio transcript:")
        meta_bits.append(transcript)
    else:
        meta_bits.append("[No transcript extracted; ensure ffmpeg and a transcription backend are installed]")

    return "\n".join(meta_bits)
|