gptmed 0.5.5__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,415 @@
1
+ """
2
+ Video data preprocessing and cleaning module
3
+
4
+ Handles video frame extraction, resizing, quality checks, and metadata extraction
5
+ """
6
+
7
+ import logging
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+ from pathlib import Path
10
+ import json
11
+
12
+ from ..base import BaseDataPreprocessor, PreprocessingConfig
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class VideoPreprocessor(BaseDataPreprocessor):
    """
    Video preprocessing with validation, frame extraction and resizing.

    Implemented in this class:
    - Video format (file extension) validation
    - Minimum resolution and min/max duration validation
    - Detection of files OpenCV cannot open
    - Frame extraction at a fixed sampling interval
    - Frame resizing (optionally aspect-preserving with black padding)
    - Basic metadata extraction (size, fps, frame count, duration)

    ``target_fps`` and ``frame_extraction_interval`` are stored on the
    instance but are not consulted by any method of this class.

    NOTE(review): ``self.logger``, ``self.config``, ``self.stats`` and
    ``self.get_statistics()`` are used here but not defined here; they are
    presumably provided by ``BaseDataPreprocessor`` -- confirm against base.
    """

    def __init__(
        self,
        config: Optional[PreprocessingConfig] = None,
        target_fps: int = 30,
        target_resolution: Tuple[int, int] = (640, 480),
        preserve_aspect_ratio: bool = True,
        min_duration: float = 1.0,  # seconds
        max_duration: Optional[float] = None,  # seconds
        min_width: int = 320,
        min_height: int = 240,
        frame_extraction_interval: Optional[int] = None,  # Extract every N frames
        extract_frames: bool = False,
        supported_formats: Optional[List[str]] = None,
    ):
        """
        Initialize video preprocessor

        Args:
            config: PreprocessingConfig instance; a default pointing at
                ./data/raw/videos and ./data/processed/videos is built when None
            target_fps: Target frames per second (stored only)
            target_resolution: Target resolution (width, height)
            preserve_aspect_ratio: Whether to preserve aspect ratio when resizing
            min_duration: Minimum video duration in seconds
            max_duration: Maximum video duration in seconds (None or 0 = no limit)
            min_width: Minimum video width
            min_height: Minimum video height
            frame_extraction_interval: Extract every N frames (stored only)
            extract_frames: Whether to extract frames to disk. Stored as
                ``self.extract_frames_enabled``; it must not be stored as
                ``self.extract_frames`` because that would shadow the
                ``extract_frames`` method on the instance.
            supported_formats: List of supported video formats (extensions)
        """
        if config is None:
            config = PreprocessingConfig(
                input_path="./data/raw/videos",
                output_path="./data/processed/videos",
                data_type="video"
            )

        super().__init__(config)

        self.target_fps = target_fps
        self.target_resolution = target_resolution
        self.preserve_aspect_ratio = preserve_aspect_ratio
        self.min_duration = min_duration
        self.max_duration = max_duration
        self.min_width = min_width
        self.min_height = min_height
        self.frame_extraction_interval = frame_extraction_interval
        # Kept under a distinct name so the extract_frames() method stays callable.
        self.extract_frames_enabled = extract_frames
        self.supported_formats = supported_formats or ['mp4', 'avi', 'mov', 'mkv', 'flv', 'wmv']

        self._import_video_library()

    def _import_video_library(self):
        """
        Probe for OpenCV and the ffmpeg executable.

        Sets ``self.opencv_available`` / ``self.ffmpeg_available`` and, when
        the import succeeds, ``self.cv2``. Missing tools only produce a
        warning; nothing is raised.
        """
        import subprocess

        self.opencv_available = False
        self.ffmpeg_available = False

        try:
            import cv2
        except ImportError:
            self.logger.warning(
                "OpenCV not available. Install with: pip install opencv-python"
            )
        else:
            self.cv2 = cv2
            self.opencv_available = True

        try:
            result = subprocess.run(['ffmpeg', '-version'], capture_output=True)
            self.ffmpeg_available = result.returncode == 0
        except (OSError, subprocess.SubprocessError):
            # OSError covers a missing or non-executable binary; anything
            # else (e.g. KeyboardInterrupt) is deliberately not swallowed.
            self.ffmpeg_available = False

        if not self.ffmpeg_available:
            self.logger.warning(
                "ffmpeg not available. Install from: https://ffmpeg.org/download.html"
            )

    def _has_supported_format(self, path: Path) -> bool:
        """
        Tell whether the file extension of ``path`` is a supported format.

        The comparison is case-insensitive and ignores a leading dot on
        either side, so both ``'mp4'`` and ``'.mp4'`` entries match ``a.MP4``.
        """
        extension = path.suffix.lower().lstrip('.')
        if not extension:
            return False
        allowed = {str(fmt).lower().lstrip('.') for fmt in self.supported_formats}
        return extension in allowed

    def validate(self, data: Any) -> bool:
        """
        Validate video input

        Args:
            data: Video file path (str)

        Returns:
            True if the file exists, has a supported extension, can be opened
            by OpenCV and satisfies the resolution/duration limits;
            False otherwise (the reason is logged)
        """
        if not self.opencv_available:
            self.logger.error("OpenCV is required for video processing")
            return False

        try:
            if not isinstance(data, str):
                self.logger.warning(f"Invalid video type: {type(data)}")
                return False

            video_path = Path(data)
            if not video_path.exists():
                self.logger.warning(f"Video file not found: {data}")
                return False

            if not self._has_supported_format(video_path):
                self.logger.warning(f"Unsupported format: {data}")
                return False

            # Try to open video; always release the handle, even on error
            cap = self.cv2.VideoCapture(str(video_path))
            try:
                if not cap.isOpened():
                    self.logger.warning(f"Cannot open video: {data}")
                    return False

                fps = cap.get(self.cv2.CAP_PROP_FPS)
                width = int(cap.get(self.cv2.CAP_PROP_FRAME_WIDTH))
                height = int(cap.get(self.cv2.CAP_PROP_FRAME_HEIGHT))
                frame_count = int(cap.get(self.cv2.CAP_PROP_FRAME_COUNT))
            finally:
                cap.release()

            # Validate dimensions
            if width < self.min_width or height < self.min_height:
                self.logger.warning(f"Video resolution too small: {width}x{height}")
                return False

            # Validate duration (an unknown fps yields duration 0)
            duration = frame_count / fps if fps > 0 else 0
            if duration < self.min_duration:
                self.logger.warning(f"Video too short: {duration:.2f}s")
                return False

            # A falsy max_duration (None or 0) means "no upper limit"
            if self.max_duration and duration > self.max_duration:
                self.logger.warning(f"Video too long: {duration:.2f}s")
                return False

            return True

        except Exception as e:
            self.logger.error(f"Video validation error: {str(e)}")
            return False

    def clean(self, video_path: str) -> Any:
        """
        Clean video data (basic validation)

        Opens the file with OpenCV. The caller owns the returned capture
        and is responsible for calling ``release()`` on it.

        Args:
            video_path: Path to video file

        Returns:
            OpenCV VideoCapture object, or None if the video cannot be opened
        """
        if not self.opencv_available:
            self.logger.error("OpenCV is required")
            return None

        try:
            cap = self.cv2.VideoCapture(str(video_path))
            if not cap.isOpened():
                cap.release()
                self.logger.error(
                    f"Video cleaning error: Cannot open video: {video_path}"
                )
                return None

            return cap

        except Exception as e:
            self.logger.error(f"Video cleaning error: {str(e)}")
            return None

    def normalize(self, video_cap: Any) -> Any:
        """
        Normalize video properties

        Currently a pass-through: the capture is returned unchanged.

        Args:
            video_cap: OpenCV VideoCapture object

        Returns:
            The same VideoCapture object
        """
        # Note: OpenCV doesn't allow changing FPS on the fly
        # Normalization happens during frame extraction
        return video_cap

    def extract_frames(
        self,
        video_path: str,
        output_dir: str,
        sample_rate: int = 1,
    ) -> List[str]:
        """
        Extract frames from video

        Every ``sample_rate``-th frame is resized with ``_resize_frame`` and
        written to ``output_dir`` as ``frame_NNNNNN.jpg``.

        Args:
            video_path: Path to video file
            output_dir: Directory to save frames (created if missing)
            sample_rate: Extract every Nth frame; must be >= 1

        Returns:
            List of paths of the frames that were actually written; an empty
            list on any error (the error is logged)
        """
        if not self.opencv_available:
            self.logger.error("OpenCV is required")
            return []

        if sample_rate < 1:
            self.logger.error(
                f"Frame extraction error: sample_rate must be >= 1, got {sample_rate}"
            )
            return []

        try:
            frames_dir = Path(output_dir)
            frames_dir.mkdir(parents=True, exist_ok=True)

            extracted_frames: List[str] = []
            frame_index = 0

            cap = self.cv2.VideoCapture(str(video_path))
            try:
                while True:
                    ret, frame = cap.read()
                    if not ret:
                        break

                    if frame_index % sample_rate == 0:
                        resized = self._resize_frame(frame)
                        frame_path = frames_dir / f"frame_{len(extracted_frames):06d}.jpg"
                        # imwrite reports failure through its return value,
                        # so only record frames that really reached the disk.
                        if self.cv2.imwrite(str(frame_path), resized):
                            extracted_frames.append(str(frame_path))
                        else:
                            self.logger.warning(f"Failed to write frame: {frame_path}")

                    frame_index += 1
            finally:
                cap.release()

            self.logger.info(f"Extracted {len(extracted_frames)} frames from video")
            return extracted_frames

        except Exception as e:
            self.logger.error(f"Frame extraction error: {str(e)}")
            return []

    def _resize_frame(self, frame: Any) -> Any:
        """
        Resize a single frame to ``self.target_resolution``

        With ``preserve_aspect_ratio`` the frame is scaled to fit inside the
        target and centred on a black canvas of exactly the target size;
        otherwise it is stretched directly to the target size.

        Args:
            frame: OpenCV frame (height and width are read from ``shape[:2]``)

        Returns:
            Resized frame
        """
        target_w, target_h = self.target_resolution[0], self.target_resolution[1]

        if not self.preserve_aspect_ratio:
            return self.cv2.resize(
                frame, self.target_resolution, interpolation=self.cv2.INTER_LANCZOS4
            )

        h, w = frame.shape[:2]
        scale = min(target_w / w, target_h / h)
        # Clamp to [1, target] so extreme aspect ratios never produce a
        # 0-pixel dimension (resize would fail) or negative padding.
        new_w = min(target_w, max(1, int(w * scale)))
        new_h = min(target_h, max(1, int(h * scale)))
        resized = self.cv2.resize(
            frame, (new_w, new_h), interpolation=self.cv2.INTER_LANCZOS4
        )

        # Pad to target size, splitting the slack evenly on both sides
        top = (target_h - new_h) // 2
        bottom = target_h - new_h - top
        left = (target_w - new_w) // 2
        right = target_w - new_w - left

        return self.cv2.copyMakeBorder(
            resized, top, bottom, left, right,
            self.cv2.BORDER_CONSTANT, value=[0, 0, 0]
        )

    def get_video_stats(self, video_path: str) -> Dict[str, Any]:
        """
        Get statistics about video

        Args:
            video_path: Path to video file

        Returns:
            Dictionary with keys file, width, height, fps, frame_count,
            duration_seconds, resolution, file_size_bytes and aspect_ratio;
            an empty dict if OpenCV is unavailable or an error occurs
        """
        if not self.opencv_available:
            return {}

        try:
            cap = self.cv2.VideoCapture(str(video_path))
            try:
                fps = cap.get(self.cv2.CAP_PROP_FPS)
                width = int(cap.get(self.cv2.CAP_PROP_FRAME_WIDTH))
                height = int(cap.get(self.cv2.CAP_PROP_FRAME_HEIGHT))
                frame_count = int(cap.get(self.cv2.CAP_PROP_FRAME_COUNT))
            finally:
                cap.release()

            duration = frame_count / fps if fps > 0 else 0

            return {
                'file': str(video_path),
                'width': width,
                'height': height,
                'fps': float(fps),
                'frame_count': frame_count,
                'duration_seconds': float(duration),
                'resolution': f"{width}x{height}",
                'file_size_bytes': Path(video_path).stat().st_size,
                'aspect_ratio': width / height if height > 0 else 0,
            }

        except Exception as e:
            self.logger.error(f"Error getting video stats: {str(e)}")
            return {}

    def batch_process_directory(
        self,
        input_dir: str,
        output_dir: Optional[str] = None,
        extract_frames: bool = False,
        frame_sample_rate: int = 30,  # Extract every 30th frame
    ) -> Dict[str, Any]:
        """
        Process all videos in a directory

        Entries whose extension is not a supported format are ignored.
        Each remaining file is validated, opened, and (optionally) has its
        frames extracted into ``<output_dir>/<file stem>/frames``.

        Args:
            input_dir: Input directory path
            output_dir: Output directory path (defaults to config.output_path)
            extract_frames: Whether to extract frames
            frame_sample_rate: Sample rate for frame extraction

        Returns:
            ``{'results': [...], 'stats': ...}`` with one result entry per
            processed or failed video, or ``{'error': ...}`` when OpenCV is
            unavailable
        """
        if not self.opencv_available:
            self.logger.error("OpenCV is required")
            return {'error': 'OpenCV not available'}

        output_dir = output_dir or self.config.output_path
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        results = []

        # Sorted so the order of results does not depend on the filesystem
        for video_file in sorted(Path(input_dir).iterdir()):
            if not self._has_supported_format(video_file):
                continue

            try:
                # Validate
                if not self.validate(str(video_file)):
                    self.stats['skipped'] += 1
                    continue

                # Process video
                cap = self.clean(str(video_file))
                if cap is None:
                    raise Exception("Failed to open video")

                try:
                    self.normalize(cap)
                finally:
                    cap.release()

                # Extract frames if requested
                frame_list: List[str] = []
                if extract_frames:
                    frames_dir = Path(output_dir) / video_file.stem / "frames"
                    frame_list = self.extract_frames(
                        str(video_file),
                        str(frames_dir),
                        sample_rate=frame_sample_rate
                    )

                self.stats['output_count'] += 1

                results.append({
                    'file': str(video_file),
                    'status': 'success',
                    'frames_extracted': len(frame_list),
                    'stats': self.get_video_stats(str(video_file))
                })

            except Exception as e:
                self.logger.error(f"Error processing {video_file}: {str(e)}")
                self.stats['errors'] += 1
                results.append({
                    'file': str(video_file),
                    'status': 'error',
                    'error': str(e)
                })

        self.logger.info(f"Processed {self.stats['output_count']} videos")
        return {'results': results, 'stats': self.get_statistics()}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gptmed
3
- Version: 0.5.5
3
+ Version: 0.7.0
4
4
  Summary: A lightweight GPT-based language model framework for training custom question-answering models on any domain
5
5
  Author-email: Sanjog Sigdel <sigdelsanjog@gmail.com>, Sanjog Sigdel <sanjog.sigdel@ku.edu.np>
6
6
  License-Expression: MIT
@@ -45,6 +45,13 @@ Requires-Dist: matplotlib>=3.5.0; extra == "xai"
45
45
  Requires-Dist: seaborn>=0.12.0; extra == "xai"
46
46
  Requires-Dist: captum>=0.6.0; extra == "xai"
47
47
  Requires-Dist: scikit-learn>=1.0.0; extra == "xai"
48
+ Provides-Extra: data-preparation
49
+ Requires-Dist: pillow>=9.0.0; extra == "data-preparation"
50
+ Requires-Dist: librosa>=0.10.0; extra == "data-preparation"
51
+ Requires-Dist: soundfile>=0.12.0; extra == "data-preparation"
52
+ Requires-Dist: opencv-python>=4.5.0; extra == "data-preparation"
53
+ Requires-Dist: numpy>=1.24.0; extra == "data-preparation"
54
+ Requires-Dist: PyPDF2>=3.0.0; extra == "data-preparation"
48
55
  Dynamic: license-file
49
56
 
50
57
  # GptMed 🤖
@@ -9,6 +9,26 @@ gptmed/data/__init__.py,sha256=iAHeakB5pBAd7MkmarPPY0UKS9bTaO_winLZ23Y2O90,54
9
9
  gptmed/data/parsers/__init__.py,sha256=BgVzXuZgeE5DUCC4SzN7vflL40wQ4Q4_4DmJ1Y43_nw,211
10
10
  gptmed/data/parsers/medquad_parser.py,sha256=g3QCRiVBdcq8RdyuYH_qKFrHgU5KkHY59WfWxUwspP0,7974
11
11
  gptmed/data/parsers/text_formatter.py,sha256=tVmnDBT54BbxX9BPKMXSPzzLmM39frDxKRKuz_HoRag,4072
12
+ gptmed/data_preparation/__init__.py,sha256=FlrFDgTe64dzblCodV0vvpxbFsvZg3ewycp8yj7gxTo,988
13
+ gptmed/data_preparation/base.py,sha256=UnkdTE3rZd2LFRLqgvErhhIYpJlYQRPHcWfCaziKj9E,5360
14
+ gptmed/data_preparation/cli.py,sha256=MUUEMmZE6_FLU-xEZlxmwuVm61C-kbpP-pknieMe5vU,13570
15
+ gptmed/data_preparation/audio/__init__.py,sha256=SAzX2jmjnEXlPWB59u_EF2wEA4ENL3Z4Pz0qsitWWR4,11871
16
+ gptmed/data_preparation/image/__init__.py,sha256=GLVmPvEEeBpXFz1FahTn94PgMKvBJUzgS7hmYQ2Qp0U,9871
17
+ gptmed/data_preparation/text/__init__.py,sha256=zUINytRdxHn4BwhN0nD7QkmjBVJfGmrrN6d6fxNqBik,11470
18
+ gptmed/data_preparation/text/base_strategy.py,sha256=12L946oWKwQmp2P9Yq40F0cVU6KS6XBmNUMZLGmuo5s,634
19
+ gptmed/data_preparation/text/batch_pdf_to_jsonl.py,sha256=f0loZuq5_SNvFDtB1aeiCG1ykTPL0fqTTRX45RtbBjQ,8628
20
+ gptmed/data_preparation/text/case_normalizer.py,sha256=WefsM2dSRq39R3MjcFL-aQmCEYJvqEbrnyKxjjOGs0s,1768
21
+ gptmed/data_preparation/text/pdf_processor.py,sha256=4J1crK51_cgu6HfCwsOwHEiYPquwueRhgFAfGYGvhDA,10545
22
+ gptmed/data_preparation/text/pipeline.py,sha256=Jj999_kSxFwNe1nGICk_ALOJpjH1cRYeDkutQr8c8Ak,11583
23
+ gptmed/data_preparation/text/preprocess_jsonl.py,sha256=HIXSJjlL6QanFlhG2i7o1Xr1fpYoUy3iQkwpU5Oj1CE,16229
24
+ gptmed/data_preparation/text/punctuation_handler.py,sha256=zPfuHR0cQIIc8HCLWA7zFsI86O-fo_7oUkCPq-GkmH4,1957
25
+ gptmed/data_preparation/text/stopword_remover.py,sha256=FGf8utJ11gAjX2DqpVsa_u7ir-DHr1ybBTc9lu47wO0,2670
26
+ gptmed/data_preparation/text/text_cleaner.py,sha256=vrvt4yGK77AleDmWwRJKmflPbOYuh66AK2QStn9xHcY,2234
27
+ gptmed/data_preparation/text/text_statistics.py,sha256=-DqW4221wt_Jc1erEeoacNMcQVpzLjyO9o9ElSUMICU,4223
28
+ gptmed/data_preparation/text/tokenize_jsonl.py,sha256=54MImqrSln9aLNhi5PHEJwgXhuAIqByvSPxE3CCu854,13801
29
+ gptmed/data_preparation/text/tokenizer.py,sha256=ukweQTR3jjUvV_gYVkDYKAtEBkES9pf8oJjh4DA252k,1958
30
+ gptmed/data_preparation/text/unicode_normalizer.py,sha256=Ck5NcGggNePpMNSr-8_0YOrIo3ZgdtlBlC2fZPo3JDY,1656
31
+ gptmed/data_preparation/video/__init__.py,sha256=nKinO9FuPbjl7oJlJcbUe2qCmShuUMpZEcuRW32k_8w,14310
12
32
  gptmed/framework/__init__.py,sha256=TlzM7NS_n0KQnm9PQTJRrb5pEb6rBXC1pqGPhbSO_bQ,25
13
33
  gptmed/framework/cli/__init__.py,sha256=oBUmoaWLCjFs3_aod-hcMCcC11UP4t4SohDnZ7Sdmx0,729
14
34
  gptmed/framework/cli/__main__.py,sha256=rLBZjEi695ZgOW8pypqpg2kLgtcDhrI_9_QcrUO3WkU,103
@@ -46,9 +66,9 @@ gptmed/training/utils.py,sha256=pJxCwneNr2STITIYwIDCxRzIICDFOxOMzK8DT7ck2oQ,5651
46
66
  gptmed/utils/__init__.py,sha256=XuMhIqOXF7mjnog_6Iky-hSbwvFb0iK42B4iDUpgi0U,44
47
67
  gptmed/utils/checkpoints.py,sha256=jPKJtO0YRZieGmpwqotgDkBzd__s_raDxS1kLpfjBJE,7113
48
68
  gptmed/utils/logging.py,sha256=7dJc1tayMxCBjFSDXe4r9ACUTpoPTTGsJ0UZMTqZIDY,5303
49
- gptmed-0.5.5.dist-info/licenses/LICENSE,sha256=v2spsd7N1pKFFh2G8wGP_45iwe5S0DYiJzG4im8Rupc,1066
50
- gptmed-0.5.5.dist-info/METADATA,sha256=-7Drfeaxy2SWQ3nlnHqwP4pj1q9bsrmTxe99pIWMePk,13842
51
- gptmed-0.5.5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
52
- gptmed-0.5.5.dist-info/entry_points.txt,sha256=ZZeYg2kOQuHHvRvQYRvq5L-RpClnBMHSpUom9DxQW0c,145
53
- gptmed-0.5.5.dist-info/top_level.txt,sha256=mhyEq3rG33t21ziJz5w3TPgx0RjPf4zXMNUx2JTiNmE,7
54
- gptmed-0.5.5.dist-info/RECORD,,
69
+ gptmed-0.7.0.dist-info/licenses/LICENSE,sha256=v2spsd7N1pKFFh2G8wGP_45iwe5S0DYiJzG4im8Rupc,1066
70
+ gptmed-0.7.0.dist-info/METADATA,sha256=7tYxcCOQEd5RCsIRgEa7VbNk_FdNjTMdmZfUj6-pFdk,14236
71
+ gptmed-0.7.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
72
+ gptmed-0.7.0.dist-info/entry_points.txt,sha256=AFrr1BzQTo-kc4cYE-uq4qcpUq1AzMyHDKojXb5xXa0,197
73
+ gptmed-0.7.0.dist-info/top_level.txt,sha256=mhyEq3rG33t21ziJz5w3TPgx0RjPf4zXMNUx2JTiNmE,7
74
+ gptmed-0.7.0.dist-info/RECORD,,
@@ -1,4 +1,5 @@
1
1
  [console_scripts]
2
+ data-preparation = gptmed.data_preparation.cli:main
2
3
  gptmed = gptmed.framework.cli:main
3
4
  gptmed-generate = gptmed.inference.generator:main
4
5
  gptmed-train = gptmed.training.train:main
File without changes