gptmed 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,268 @@
1
+ """
2
+ Text data preprocessing and cleaning module
3
+
4
+ Handles text normalization, cleaning, tokenization, and validation
5
+ """
6
+
7
+ import re
8
+ import string
9
+ import unicodedata
10
+ import logging
11
+ from typing import Any, Dict, List, Optional
12
+ from pathlib import Path
13
+ import json
14
+
15
+ from ..base import BaseDataPreprocessor, PreprocessingConfig
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class TextPreprocessor(BaseDataPreprocessor):
    """
    Text preprocessing with cleaning, normalization, and validation.

    Features:
    - Text cleaning (whitespace, HTML tags, URLs, emails, control characters)
    - Case normalization
    - Unicode normalization (NFD decomposition with combining-mark removal)
    - Stopword removal
    - Punctuation handling
    """

    # Patterns compiled once at class-creation time instead of on every
    # clean()/get_text_stats() call — these run once per document in batch
    # jobs, so the per-call re.sub string-pattern lookup was pure overhead.
    _HTML_TAG_RE = re.compile(r'<[^>]+>')
    _URL_RE = re.compile(r'http[s]?://\S+')
    _WWW_RE = re.compile(r'www\.\S+')
    _EMAIL_RE = re.compile(r'\S+@\S+')
    _MULTI_WS_RE = re.compile(r'\s+')
    _PUNCT_SPACING_RE = re.compile(r'\s+([.!?,;:])')
    _SENTENCE_SPLIT_RE = re.compile(r'[.!?]+')

    # Basic English stopword list; immutable and shared across instances.
    _STOPWORDS = frozenset({
        'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
        'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
        'do', 'at', 'this', 'but', 'his', 'by', 'from', 'is', 'was',
        'are', 'been', 'were', 'or', 'an', 'which', 'their', 'what',
        'so', 'up', 'out', 'if', 'about', 'who', 'get', 'them', 'me',
    })

    def __init__(
        self,
        config: Optional[PreprocessingConfig] = None,
        remove_stopwords: bool = False,
        remove_punctuation: bool = False,
        lowercase: bool = True,
        min_length: int = 3,
        max_length: Optional[int] = None,
    ):
        """
        Initialize text preprocessor.

        Args:
            config: PreprocessingConfig instance (a default text config is
                created when None)
            remove_stopwords: Whether to remove common stopwords
            remove_punctuation: Whether to remove punctuation
            lowercase: Whether to convert to lowercase
            min_length: Minimum text length to keep
            max_length: Maximum text length (None for unlimited)
        """
        if config is None:
            config = PreprocessingConfig(
                input_path="./data/raw",
                output_path="./data/processed",
                data_type="text"
            )

        super().__init__(config)

        self.remove_stopwords = remove_stopwords
        self.remove_punctuation = remove_punctuation
        self.lowercase = lowercase
        self.min_length = min_length
        self.max_length = max_length

        # Empty set when disabled so membership tests are cheap no-ops.
        self.stopwords = self._load_stopwords() if remove_stopwords else set()

    def _load_stopwords(self) -> set:
        """Return a mutable copy of the common English stopword set."""
        # Copy so callers that mutate self.stopwords don't corrupt the
        # shared class-level constant.
        return set(self._STOPWORDS)

    def validate(self, data: Any) -> bool:
        """
        Validate text input.

        Args:
            data: Input text

        Returns:
            True if valid, False otherwise
        """
        if not isinstance(data, str):
            self.logger.warning(f"Invalid text type: {type(data)}")
            return False

        # Length gate uses the stripped length so whitespace-only input
        # is rejected, matching how clean() would collapse it anyway.
        if len(data.strip()) < self.min_length:
            self.logger.debug(f"Text too short: {len(data)}")
            return False

        if self.max_length and len(data) > self.max_length:
            self.logger.debug(f"Text too long: {len(data)}")
            return False

        return True

    def clean(self, text: str) -> str:
        """
        Clean text by removing artifacts and normalizing.

        Args:
            text: Raw text

        Returns:
            Cleaned text
        """
        text = text.strip()

        # Normalize unicode to NFD (decomposed form), then drop combining
        # marks ('Mn') — this strips accents: 'é' -> 'e'.
        text = unicodedata.normalize('NFD', text)
        text = ''.join(ch for ch in text if unicodedata.category(ch) != 'Mn')

        # Remove HTML tags, URLs, and email addresses.
        text = self._HTML_TAG_RE.sub('', text)
        text = self._URL_RE.sub('', text)
        text = self._WWW_RE.sub('', text)
        text = self._EMAIL_RE.sub('', text)

        # Collapse runs of whitespace to a single space.
        text = self._MULTI_WS_RE.sub(' ', text)

        # Drop non-printable control characters (keep whitespace).
        text = ''.join(ch for ch in text if ch.isprintable() or ch.isspace())

        return text.strip()

    def normalize(self, text: str) -> str:
        """
        Normalize text.

        Args:
            text: Cleaned text

        Returns:
            Normalized text
        """
        if self.lowercase:
            text = text.lower()

        if self.remove_punctuation:
            # One C-level pass removes all punctuation characters.
            text = text.translate(str.maketrans('', '', string.punctuation))
        else:
            # Keep punctuation but normalize spacing before it ("word ." -> "word.").
            text = self._PUNCT_SPACING_RE.sub(r'\1', text)

        if self.remove_stopwords:
            # NOTE(review): when punctuation is kept, tokens like "the,"
            # will not match bare stopwords — preserved original behavior.
            words = text.split()
            words = [w for w in words if w not in self.stopwords]
            text = ' '.join(words)

        return text.strip()

    def tokenize(self, text: str) -> List[str]:
        """
        Simple whitespace word tokenization of fully processed text.

        Args:
            text: Text to tokenize

        Returns:
            List of tokens (empty list when processing rejects the text)
        """
        processed = self.process(text)
        if processed is None:
            return []

        return processed.split()

    def get_text_stats(self, text: str) -> Dict[str, Any]:
        """
        Get statistics about the text.

        Args:
            text: Input text

        Returns:
            Dictionary with text statistics (empty dict when processing
            rejects the text)
        """
        processed = self.process(text)
        if processed is None:
            return {}
        return self._compute_stats(text, processed)

    def _compute_stats(self, text: str, processed: str) -> Dict[str, Any]:
        """Build the stats dict from raw text and its processed form."""
        words = processed.split()
        sentences = self._SENTENCE_SPLIT_RE.split(processed)
        sentences = [s.strip() for s in sentences if s.strip()]

        return {
            'original_length': len(text),
            'cleaned_length': len(processed),
            'word_count': len(words),
            'sentence_count': len(sentences),
            'avg_word_length': sum(len(w) for w in words) / len(words) if words else 0,
            'unique_words': len(set(words)),
            'vocabulary_diversity': len(set(words)) / len(words) if words else 0,
        }

    def batch_process_files(
        self,
        input_dir: str,
        output_dir: Optional[str] = None,
        pattern: str = "*.txt"
    ) -> Dict[str, Any]:
        """
        Process multiple text files from a directory.

        Args:
            input_dir: Input directory path
            output_dir: Output directory path (uses config if None)
            pattern: File pattern to match

        Returns:
            Processing results: {'results': [...], 'stats': {...}}
        """
        output_dir = output_dir or self.config.output_path
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        input_path = Path(input_dir)
        results = []

        for file_path in input_path.glob(pattern):
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()

                # Process exactly once. The previous implementation re-ran
                # the full pipeline inside get_text_stats(), doubling the
                # work and double-counting per-file pipeline statistics.
                processed = self.process(text)

                if processed:
                    output_file = Path(output_dir) / file_path.name
                    with open(output_file, 'w', encoding='utf-8') as f:
                        f.write(processed)

                    results.append({
                        'file': str(file_path),
                        'status': 'success',
                        'stats': self._compute_stats(text, processed)
                    })

            except Exception as e:
                self.logger.error(f"Error processing {file_path}: {str(e)}")
                results.append({
                    'file': str(file_path),
                    'status': 'error',
                    'error': str(e)
                })

        self.logger.info(f"Processed {len(results)} files")
        return {'results': results, 'stats': self.get_statistics()}
@@ -0,0 +1,415 @@
1
+ """
2
+ Video data preprocessing and cleaning module
3
+
4
+ Handles video frame extraction, resizing, quality checks, and metadata extraction
5
+ """
6
+
7
+ import logging
8
+ from typing import Any, Dict, List, Optional, Tuple
9
+ from pathlib import Path
10
+ import json
11
+
12
+ from ..base import BaseDataPreprocessor, PreprocessingConfig
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class VideoPreprocessor(BaseDataPreprocessor):
    """
    Video preprocessing with frame extraction, resizing, and validation.

    Features:
    - Video format validation
    - Frame extraction at specified intervals
    - Resolution resizing (optionally aspect-preserving with padding)
    - Duration and resolution validation
    - Metadata/statistics extraction

    Requires OpenCV (``opencv-python``); ffmpeg availability is detected
    but only recorded, not used directly here.
    """

    def __init__(
        self,
        config: Optional[PreprocessingConfig] = None,
        target_fps: int = 30,
        target_resolution: Tuple[int, int] = (640, 480),
        preserve_aspect_ratio: bool = True,
        min_duration: float = 1.0,  # seconds
        max_duration: Optional[float] = None,  # seconds
        min_width: int = 320,
        min_height: int = 240,
        frame_extraction_interval: Optional[int] = None,  # Extract every N frames
        extract_frames: bool = False,
        supported_formats: Optional[List[str]] = None,
    ):
        """
        Initialize video preprocessor.

        Args:
            config: PreprocessingConfig instance
            target_fps: Target frames per second
            target_resolution: Target resolution (width, height)
            preserve_aspect_ratio: Whether to preserve aspect ratio
            min_duration: Minimum video duration in seconds
            max_duration: Maximum video duration in seconds
            min_width: Minimum video width
            min_height: Minimum video height
            frame_extraction_interval: Extract every N frames (None = no extraction)
            extract_frames: Whether to extract frames to disk
            supported_formats: List of supported video formats (extensions
                without the leading dot)
        """
        if config is None:
            config = PreprocessingConfig(
                input_path="./data/raw/videos",
                output_path="./data/processed/videos",
                data_type="video"
            )

        super().__init__(config)

        self.target_fps = target_fps
        self.target_resolution = target_resolution
        self.preserve_aspect_ratio = preserve_aspect_ratio
        self.min_duration = min_duration
        self.max_duration = max_duration
        self.min_width = min_width
        self.min_height = min_height
        self.frame_extraction_interval = frame_extraction_interval
        # BUGFIX: previously stored as ``self.extract_frames``, which
        # shadowed the extract_frames() method on every instance and made
        # batch_process_directory()'s ``self.extract_frames(...)`` call
        # raise "TypeError: 'bool' object is not callable".
        self.extract_frames_enabled = extract_frames
        self.supported_formats = supported_formats or ['mp4', 'avi', 'mov', 'mkv', 'flv', 'wmv']

        self._import_video_library()

    def _import_video_library(self):
        """Detect optional video backends (OpenCV and ffmpeg)."""
        self.opencv_available = False
        self.ffmpeg_available = False

        try:
            import cv2
            self.cv2 = cv2
            self.opencv_available = True
        except ImportError:
            self.logger.warning(
                "OpenCV not available. Install with: pip install opencv-python"
            )

        try:
            import subprocess
            result = subprocess.run(['ffmpeg', '-version'], capture_output=True)
            self.ffmpeg_available = result.returncode == 0
        except Exception:
            # Narrowed from a bare ``except:`` (which also swallowed
            # KeyboardInterrupt/SystemExit). Typically FileNotFoundError
            # when ffmpeg is not on PATH; availability detection is
            # best-effort, so we log and continue.
            self.logger.warning(
                "ffmpeg not available. Install from: https://ffmpeg.org/download.html"
            )

    def validate(self, data: Any) -> bool:
        """
        Validate video input.

        Args:
            data: Video file path (str)

        Returns:
            True if valid, False otherwise
        """
        if not self.opencv_available:
            self.logger.error("OpenCV is required for video processing")
            return False

        try:
            if not isinstance(data, str):
                self.logger.warning(f"Invalid video type: {type(data)}")
                return False

            video_path = Path(data)
            if not video_path.exists():
                self.logger.warning(f"Video file not found: {data}")
                return False

            # BUGFIX: was a substring endswith() check, so a dot-less name
            # like "clip_mp4" passed. Use the actual file extension, which
            # also matches batch_process_directory()'s filter.
            if video_path.suffix.lower().lstrip('.') not in self.supported_formats:
                self.logger.warning(f"Unsupported format: {data}")
                return False

            # Try to open video.
            cap = self.cv2.VideoCapture(str(video_path))
            if not cap.isOpened():
                self.logger.warning(f"Cannot open video: {data}")
                return False

            # Read properties, then release the handle before validating.
            fps = cap.get(self.cv2.CAP_PROP_FPS)
            width = int(cap.get(self.cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(self.cv2.CAP_PROP_FRAME_HEIGHT))
            frame_count = int(cap.get(self.cv2.CAP_PROP_FRAME_COUNT))

            cap.release()

            if width < self.min_width or height < self.min_height:
                self.logger.warning(f"Video resolution too small: {width}x{height}")
                return False

            # Guard against fps == 0 (corrupt header) to avoid ZeroDivisionError.
            duration = frame_count / fps if fps > 0 else 0
            if duration < self.min_duration:
                self.logger.warning(f"Video too short: {duration:.2f}s")
                return False

            if self.max_duration and duration > self.max_duration:
                self.logger.warning(f"Video too long: {duration:.2f}s")
                return False

            return True

        except Exception as e:
            self.logger.error(f"Video validation error: {str(e)}")
            return False

    def clean(self, video_path: str) -> Any:
        """
        Open the video for processing (basic validation).

        Args:
            video_path: Path to video file

        Returns:
            OpenCV VideoCapture object, or None on failure. The caller is
            responsible for calling ``release()`` on the returned capture.
        """
        try:
            cap = self.cv2.VideoCapture(str(video_path))
            if not cap.isOpened():
                raise Exception(f"Cannot open video: {video_path}")

            return cap

        except Exception as e:
            self.logger.error(f"Video cleaning error: {str(e)}")
            return None

    def normalize(self, video_cap: Any) -> Any:
        """
        Normalize video properties.

        Args:
            video_cap: OpenCV VideoCapture object

        Returns:
            The same VideoCapture — OpenCV cannot change FPS/resolution on
            an open capture, so actual normalization happens per-frame
            during extraction (see _resize_frame).
        """
        return video_cap

    def extract_frames(
        self,
        video_path: str,
        output_dir: str,
        sample_rate: int = 1,
    ) -> List[str]:
        """
        Extract frames from video as JPEG files.

        Args:
            video_path: Path to video file
            output_dir: Directory to save frames
            sample_rate: Extract every Nth frame

        Returns:
            List of extracted frame paths (empty on failure)
        """
        if not self.opencv_available:
            self.logger.error("OpenCV is required")
            return []

        try:
            Path(output_dir).mkdir(parents=True, exist_ok=True)

            cap = self.cv2.VideoCapture(str(video_path))
            frame_count = 0
            extracted_count = 0
            extracted_frames = []

            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_count % sample_rate == 0:
                    # Resize (and pad, if aspect-preserving) before saving.
                    resized = self._resize_frame(frame)

                    # Zero-padded index keeps frames lexically ordered.
                    frame_path = Path(output_dir) / f"frame_{extracted_count:06d}.jpg"
                    self.cv2.imwrite(str(frame_path), resized)
                    extracted_frames.append(str(frame_path))
                    extracted_count += 1

                frame_count += 1

            cap.release()
            self.logger.info(f"Extracted {extracted_count} frames from video")

            return extracted_frames

        except Exception as e:
            self.logger.error(f"Frame extraction error: {str(e)}")
            return []

    def _resize_frame(self, frame: Any) -> Any:
        """
        Resize a single frame to the target resolution.

        When preserve_aspect_ratio is set, the frame is scaled to fit and
        then letterboxed with black borders to exactly target_resolution.

        Args:
            frame: OpenCV frame (H x W x C ndarray)

        Returns:
            Resized frame of size target_resolution
        """
        if self.preserve_aspect_ratio:
            h, w = frame.shape[:2]
            # Scale by the limiting dimension so the frame fits inside target.
            scale = min(
                self.target_resolution[0] / w,
                self.target_resolution[1] / h
            )
            new_w = int(w * scale)
            new_h = int(h * scale)
            resized = self.cv2.resize(frame, (new_w, new_h), interpolation=self.cv2.INTER_LANCZOS4)

            # Pad symmetrically to the exact target size (letterbox/pillarbox).
            top = (self.target_resolution[1] - new_h) // 2
            bottom = self.target_resolution[1] - new_h - top
            left = (self.target_resolution[0] - new_w) // 2
            right = self.target_resolution[0] - new_w - left

            padded = self.cv2.copyMakeBorder(
                resized, top, bottom, left, right,
                self.cv2.BORDER_CONSTANT, value=[0, 0, 0]
            )
            return padded
        else:
            return self.cv2.resize(frame, self.target_resolution, interpolation=self.cv2.INTER_LANCZOS4)

    def get_video_stats(self, video_path: str) -> Dict[str, Any]:
        """
        Get statistics about a video file.

        Args:
            video_path: Path to video file

        Returns:
            Dictionary with video statistics (empty dict on failure)
        """
        try:
            if not self.opencv_available:
                return {}

            cap = self.cv2.VideoCapture(str(video_path))

            fps = cap.get(self.cv2.CAP_PROP_FPS)
            width = int(cap.get(self.cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(self.cv2.CAP_PROP_FRAME_HEIGHT))
            frame_count = int(cap.get(self.cv2.CAP_PROP_FRAME_COUNT))

            cap.release()

            duration = frame_count / fps if fps > 0 else 0

            stats = {
                'file': str(video_path),
                'width': width,
                'height': height,
                'fps': float(fps),
                'frame_count': frame_count,
                'duration_seconds': float(duration),
                'resolution': f"{width}x{height}",
                'file_size_bytes': Path(video_path).stat().st_size,
                'aspect_ratio': width / height if height > 0 else 0,
            }

            return stats

        except Exception as e:
            self.logger.error(f"Error getting video stats: {str(e)}")
            return {}

    def batch_process_directory(
        self,
        input_dir: str,
        output_dir: Optional[str] = None,
        extract_frames: bool = False,
        frame_sample_rate: int = 30,  # Extract every 30th frame
    ) -> Dict[str, Any]:
        """
        Process all videos in a directory.

        Args:
            input_dir: Input directory path
            output_dir: Output directory path (uses config if None)
            extract_frames: Whether to extract frames
            frame_sample_rate: Sample rate for frame extraction

        Returns:
            Processing results: {'results': [...], 'stats': {...}}
        """
        if not self.opencv_available:
            self.logger.error("OpenCV is required")
            return {'error': 'OpenCV not available'}

        output_dir = output_dir or self.config.output_path
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        input_path = Path(input_dir)
        results = []

        for video_file in input_path.iterdir():
            # Filter by extension (without the leading dot).
            if video_file.suffix.lower()[1:] not in self.supported_formats:
                continue

            try:
                if not self.validate(str(video_file)):
                    self.stats['skipped'] += 1
                    continue

                # Open the video; clean() logs and returns None on failure.
                cap = self.clean(str(video_file))
                if cap is None:
                    raise Exception("Failed to open video")

                self.normalize(cap)  # no-op today; kept for pipeline symmetry
                cap.release()

                # Extract frames if requested. This call previously crashed
                # because the bool __init__ attribute shadowed this method.
                frame_list = []
                if extract_frames:
                    frames_dir = Path(output_dir) / video_file.stem / "frames"
                    frame_list = self.extract_frames(
                        str(video_file),
                        str(frames_dir),
                        sample_rate=frame_sample_rate
                    )

                self.stats['output_count'] += 1

                results.append({
                    'file': str(video_file),
                    'status': 'success',
                    'frames_extracted': len(frame_list),
                    'stats': self.get_video_stats(str(video_file))
                })

            except Exception as e:
                self.logger.error(f"Error processing {video_file}: {str(e)}")
                self.stats['errors'] += 1
                results.append({
                    'file': str(video_file),
                    'status': 'error',
                    'error': str(e)
                })

        self.logger.info(f"Processed {self.stats['output_count']} videos")
        return {'results': results, 'stats': self.get_statistics()}
@@ -3,6 +3,22 @@ from .startproject import startproject
3
3
 
4
4
def main():
    """Parse CLI arguments and dispatch the ``startproject`` command.

    Expects ``gptmed startproject <projectname>`` with an optional
    ``--qna`` or ``--conversational`` flag selecting the project type;
    exits with status 1 on any malformed invocation.
    """
    args = sys.argv
    usage = "Usage: gptmed startproject <projectname> [--qna|--conversational]"

    # Guard clause: require the literal "startproject" subcommand plus a name.
    if len(args) < 3 or args[1] != "startproject":
        print(usage)
        sys.exit(1)

    name = args[2]

    # Map each recognized flag to its project type; None means "no flag given".
    flag_to_type = {"--qna": "qna", "--conversational": "conversational"}
    kind = None

    if len(args) > 3:
        flag = args[3]
        if flag in flag_to_type:
            kind = flag_to_type[flag]
        else:
            print(f"Invalid flag: {flag}")
            print(usage)
            sys.exit(1)

    startproject(name, kind)