massgen-0.1.3-py3-none-any.whl → massgen-0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of massgen might be problematic. Click here for more details.

Files changed (58)
  1. massgen/__init__.py +1 -1
  2. massgen/api_params_handler/_chat_completions_api_params_handler.py +4 -0
  3. massgen/api_params_handler/_claude_api_params_handler.py +4 -0
  4. massgen/api_params_handler/_gemini_api_params_handler.py +4 -0
  5. massgen/api_params_handler/_response_api_params_handler.py +4 -0
  6. massgen/backend/base_with_custom_tool_and_mcp.py +25 -5
  7. massgen/backend/docs/permissions_and_context_files.md +2 -2
  8. massgen/backend/response.py +2 -0
  9. massgen/configs/README.md +49 -40
  10. massgen/configs/tools/custom_tools/crawl4ai_example.yaml +55 -0
  11. massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_multi.yaml +61 -0
  12. massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_single.yaml +29 -0
  13. massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_multi.yaml +51 -0
  14. massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_single.yaml +33 -0
  15. massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_multi.yaml +55 -0
  16. massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_single.yaml +33 -0
  17. massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_multi.yaml +47 -0
  18. massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml +29 -0
  19. massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +1 -1
  20. massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +1 -1
  21. massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +1 -1
  22. massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +1 -1
  23. massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +1 -1
  24. massgen/filesystem_manager/_filesystem_manager.py +1 -0
  25. massgen/filesystem_manager/_path_permission_manager.py +148 -0
  26. massgen/message_templates.py +160 -12
  27. massgen/orchestrator.py +16 -0
  28. massgen/tests/test_binary_file_blocking.py +274 -0
  29. massgen/tests/test_case_studies.md +12 -12
  30. massgen/tests/test_multimodal_size_limits.py +407 -0
  31. massgen/tool/_manager.py +7 -2
  32. massgen/tool/_multimodal_tools/image_to_image_generation.py +293 -0
  33. massgen/tool/_multimodal_tools/text_to_file_generation.py +455 -0
  34. massgen/tool/_multimodal_tools/text_to_image_generation.py +222 -0
  35. massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py +226 -0
  36. massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py +217 -0
  37. massgen/tool/_multimodal_tools/text_to_video_generation.py +223 -0
  38. massgen/tool/_multimodal_tools/understand_audio.py +19 -1
  39. massgen/tool/_multimodal_tools/understand_file.py +6 -1
  40. massgen/tool/_multimodal_tools/understand_image.py +112 -8
  41. massgen/tool/_multimodal_tools/understand_video.py +32 -5
  42. massgen/tool/_web_tools/crawl4ai_tool.py +718 -0
  43. massgen/tool/docs/multimodal_tools.md +589 -0
  44. {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/METADATA +96 -69
  45. {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/RECORD +49 -40
  46. massgen/configs/tools/custom_tools/crawl4ai_mcp_example.yaml +0 -67
  47. massgen/configs/tools/custom_tools/crawl4ai_multi_agent_example.yaml +0 -68
  48. massgen/configs/tools/custom_tools/multimodal_tools/playwright_with_img_understanding.yaml +0 -98
  49. massgen/configs/tools/custom_tools/multimodal_tools/understand_video_example.yaml +0 -54
  50. massgen/configs/tools/memory/README.md +0 -199
  51. massgen/configs/tools/memory/gpt5mini_gemini_context_window_management.yaml +0 -131
  52. massgen/configs/tools/memory/gpt5mini_gemini_no_persistent_memory.yaml +0 -133
  53. massgen/configs/tools/memory/test_context_window_management.py +0 -286
  54. massgen/configs/tools/multimodal/gpt5mini_gpt5nano_documentation_evolution.yaml +0 -97
  55. {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/WHEEL +0 -0
  56. {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/entry_points.txt +0 -0
  57. {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/licenses/LICENSE +0 -0
  58. {massgen-0.1.3.dist-info → massgen-0.1.4.dist-info}/top_level.txt +0 -0
@@ -42,6 +42,7 @@ async def understand_audio(
42
42
  audio_paths: List[str],
43
43
  model: str = "gpt-4o-transcribe",
44
44
  allowed_paths: Optional[List[str]] = None,
45
+ agent_cwd: Optional[str] = None,
45
46
  ) -> ExecutionResult:
46
47
  """
47
48
  Transcribe audio file(s) to text using OpenAI's Transcription API.
@@ -55,6 +56,7 @@ async def understand_audio(
55
56
  - Absolute path: Must be within allowed directories
56
57
  model: Model to use (default: "gpt-4o-transcribe")
57
58
  allowed_paths: List of allowed base paths for validation (optional)
59
+ agent_cwd: Current working directory of the agent (optional)
58
60
 
59
61
  Returns:
60
62
  ExecutionResult containing:
@@ -108,10 +110,13 @@ async def understand_audio(
108
110
 
109
111
  for audio_path_str in audio_paths:
110
112
  # Resolve audio path
113
+ # Use agent_cwd if available, otherwise fall back to Path.cwd()
114
+ base_dir = Path(agent_cwd) if agent_cwd else Path.cwd()
115
+
111
116
  if Path(audio_path_str).is_absolute():
112
117
  audio_path = Path(audio_path_str).resolve()
113
118
  else:
114
- audio_path = (Path.cwd() / audio_path_str).resolve()
119
+ audio_path = (base_dir / audio_path_str).resolve()
115
120
 
116
121
  # Validate audio path
117
122
  _validate_path_access(audio_path, allowed_paths_list)
@@ -137,6 +142,19 @@ async def understand_audio(
137
142
  output_blocks=[TextContent(data=json.dumps(result, indent=2))],
138
143
  )
139
144
 
145
+ # Check file size (OpenAI Whisper API has 25MB limit)
146
+ file_size = audio_path.stat().st_size
147
+ max_size = 25 * 1024 * 1024 # 25MB
148
+ if file_size > max_size:
149
+ result = {
150
+ "success": False,
151
+ "operation": "generate_text_with_input_audio",
152
+ "error": f"Audio file too large: {audio_path} ({file_size/1024/1024:.1f}MB > 25MB). " "Please use a smaller file or compress the audio.",
153
+ }
154
+ return ExecutionResult(
155
+ output_blocks=[TextContent(data=json.dumps(result, indent=2))],
156
+ )
157
+
140
158
  validated_audio_paths.append(audio_path)
141
159
 
142
160
  # Process each audio file separately using OpenAI Transcription API
@@ -199,6 +199,7 @@ async def understand_file(
199
199
  model: str = "gpt-4.1",
200
200
  max_chars: int = 50000,
201
201
  allowed_paths: Optional[List[str]] = None,
202
+ agent_cwd: Optional[str] = None,
202
203
  ) -> ExecutionResult:
203
204
  """
204
205
  Understand and analyze file contents using OpenAI's gpt-4.1 API.
@@ -216,6 +217,7 @@ async def understand_file(
216
217
  - Prevents processing extremely large files
217
218
  - Applies to both text files and extracted content from documents
218
219
  allowed_paths: List of allowed base paths for validation (optional)
220
+ agent_cwd: Agent's current working directory (automatically injected, optional)
219
221
 
220
222
  Returns:
221
223
  ExecutionResult containing:
@@ -306,10 +308,13 @@ async def understand_file(
306
308
  client = OpenAI(api_key=openai_api_key)
307
309
 
308
310
  # Resolve file path
311
+ # Use agent_cwd if available, otherwise fall back to Path.cwd()
312
+ base_dir = Path(agent_cwd) if agent_cwd else Path.cwd()
313
+
309
314
  if Path(file_path).is_absolute():
310
315
  f_path = Path(file_path).resolve()
311
316
  else:
312
- f_path = (Path.cwd() / file_path).resolve()
317
+ f_path = (base_dir / file_path).resolve()
313
318
 
314
319
  # Validate file path
315
320
  _validate_path_access(f_path, allowed_paths_list)
@@ -12,6 +12,7 @@ from typing import List, Optional
12
12
  from dotenv import load_dotenv
13
13
  from openai import OpenAI
14
14
 
15
+ from massgen.logger_config import logger
15
16
  from massgen.tool._result import ExecutionResult, TextContent
16
17
 
17
18
 
@@ -44,6 +45,7 @@ async def understand_image(
44
45
  prompt: str = "What's in this image? Please describe it in detail.",
45
46
  model: str = "gpt-4.1",
46
47
  allowed_paths: Optional[List[str]] = None,
48
+ agent_cwd: Optional[str] = None,
47
49
  ) -> ExecutionResult:
48
50
  """
49
51
  Understand and analyze an image using OpenAI's gpt-4.1 API.
@@ -58,6 +60,7 @@ async def understand_image(
58
60
  prompt: Question or instruction about the image (default: "What's in this image? Please describe it in detail.")
59
61
  model: Model to use (default: "gpt-4.1")
60
62
  allowed_paths: List of allowed base paths for validation (optional)
63
+ agent_cwd: Agent's current working directory (automatically injected)
61
64
 
62
65
  Returns:
63
66
  ExecutionResult containing:
@@ -111,10 +114,13 @@ async def understand_image(
111
114
  client = OpenAI(api_key=openai_api_key)
112
115
 
113
116
  # Resolve image path
117
+ # Use agent_cwd if available, otherwise fall back to Path.cwd()
118
+ base_dir = Path(agent_cwd) if agent_cwd else Path.cwd()
119
+
114
120
  if Path(image_path).is_absolute():
115
121
  img_path = Path(image_path).resolve()
116
122
  else:
117
- img_path = (Path.cwd() / image_path).resolve()
123
+ img_path = (base_dir / image_path).resolve()
118
124
 
119
125
  # Validate image path
120
126
  _validate_path_access(img_path, allowed_paths_list)
@@ -140,11 +146,112 @@ async def understand_image(
140
146
  output_blocks=[TextContent(data=json.dumps(result, indent=2))],
141
147
  )
142
148
 
143
- # Read and encode image to base64
149
+ # Read image and check size and dimensions
144
150
  try:
145
- with open(img_path, "rb") as image_file:
146
- image_data = image_file.read()
147
- base64_image = base64.b64encode(image_data).decode("utf-8")
151
+ # OpenAI Vision API limits:
152
+ # - Up to 20MB per image
153
+ # - High-resolution: 768px (short side) x 2000px (long side)
154
+ file_size = img_path.stat().st_size
155
+ max_size = 18 * 1024 * 1024 # 18MB (conservative buffer under OpenAI's 20MB limit)
156
+ max_short_side = 768 # Maximum pixels for short side
157
+ max_long_side = 2000 # Maximum pixels for long side
158
+
159
+ # Try to import PIL for dimension/size checking
160
+ try:
161
+ import io
162
+
163
+ from PIL import Image
164
+ except ImportError:
165
+ # PIL not available - fall back to simple file reading
166
+ # This will work for small images but may fail for large ones
167
+ if file_size > max_size:
168
+ result = {
169
+ "success": False,
170
+ "operation": "understand_image",
171
+ "error": f"Image too large ({file_size/1024/1024:.1f}MB > {max_size/1024/1024:.0f}MB) and PIL not available for resizing. Install with: pip install pillow",
172
+ }
173
+ return ExecutionResult(
174
+ output_blocks=[TextContent(data=json.dumps(result, indent=2))],
175
+ )
176
+ # Read without resizing
177
+ with open(img_path, "rb") as image_file:
178
+ image_data = image_file.read()
179
+ base64_image = base64.b64encode(image_data).decode("utf-8")
180
+ mime_type = "image/jpeg" if img_path.suffix.lower() in [".jpg", ".jpeg"] else "image/png"
181
+ logger.info(f"Read image without dimension check (PIL not available): {img_path.name} ({file_size/1024/1024:.1f}MB)")
182
+
183
+ else:
184
+ # PIL available - check both file size and dimensions
185
+ img = Image.open(img_path)
186
+ img.size
187
+ original_width, original_height = img.size
188
+
189
+ # Determine short and long sides
190
+ short_side = min(original_width, original_height)
191
+ long_side = max(original_width, original_height)
192
+
193
+ # Check if we need to resize
194
+ needs_resize = False
195
+ resize_reason = []
196
+
197
+ if file_size > max_size:
198
+ needs_resize = True
199
+ resize_reason.append(f"file size {file_size/1024/1024:.1f}MB > {max_size/1024/1024:.0f}MB")
200
+
201
+ if short_side > max_short_side or long_side > max_long_side:
202
+ needs_resize = True
203
+ resize_reason.append(f"dimensions {original_width}x{original_height} exceed {max_short_side}x{max_long_side}")
204
+
205
+ if needs_resize:
206
+ # Calculate scale factor based on both size and dimensions
207
+ scale_factors = []
208
+
209
+ # Scale for file size (if needed)
210
+ if file_size > max_size:
211
+ # Estimate: reduce dimensions by sqrt of size ratio
212
+ size_scale = (max_size / file_size) ** 0.5 * 0.8 # 0.8 for safety margin
213
+ scale_factors.append(size_scale)
214
+
215
+ # Scale for dimensions (if needed)
216
+ if short_side > max_short_side or long_side > max_long_side:
217
+ # Calculate scale needed to fit within dimension constraints
218
+ short_scale = max_short_side / short_side if short_side > max_short_side else 1.0
219
+ long_scale = max_long_side / long_side if long_side > max_long_side else 1.0
220
+ dimension_scale = min(short_scale, long_scale) * 0.95 # 0.95 for safety margin
221
+ scale_factors.append(dimension_scale)
222
+
223
+ # Use the most restrictive scale factor
224
+ scale_factor = min(scale_factors)
225
+ new_width = int(original_width * scale_factor)
226
+ new_height = int(original_height * scale_factor)
227
+
228
+ # Resize image
229
+ img_resized = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
230
+
231
+ # Convert to bytes
232
+ img_byte_arr = io.BytesIO()
233
+ # Save as JPEG for better compression
234
+ img_resized.convert("RGB").save(img_byte_arr, format="JPEG", quality=85, optimize=True)
235
+ image_data = img_byte_arr.getvalue()
236
+
237
+ base64_image = base64.b64encode(image_data).decode("utf-8")
238
+ mime_type = "image/jpeg"
239
+
240
+ logger.info(
241
+ f"Resized image ({', '.join(resize_reason)}): "
242
+ f"{original_width}x{original_height} ({file_size/1024/1024:.1f}MB) -> "
243
+ f"{new_width}x{new_height} ({len(image_data)/1024/1024:.1f}MB)",
244
+ )
245
+
246
+ else:
247
+ # No resize needed - read normally
248
+ with open(img_path, "rb") as image_file:
249
+ image_data = image_file.read()
250
+ base64_image = base64.b64encode(image_data).decode("utf-8")
251
+ # Determine MIME type
252
+ mime_type = "image/jpeg" if img_path.suffix.lower() in [".jpg", ".jpeg"] else "image/png"
253
+ logger.info(f"Image within limits: {original_width}x{original_height} ({file_size/1024/1024:.1f}MB)")
254
+
148
255
  except Exception as read_error:
149
256
  result = {
150
257
  "success": False,
@@ -155,9 +262,6 @@ async def understand_image(
155
262
  output_blocks=[TextContent(data=json.dumps(result, indent=2))],
156
263
  )
157
264
 
158
- # Determine MIME type
159
- mime_type = "image/jpeg" if img_path.suffix.lower() in [".jpg", ".jpeg"] else "image/png"
160
-
161
265
  try:
162
266
  # Call OpenAI API for image understanding
163
267
  response = client.responses.create(
@@ -41,14 +41,14 @@ def _validate_path_access(path: Path, allowed_paths: Optional[List[Path]] = None
41
41
 
42
42
  def _extract_key_frames(video_path: Path, num_frames: int = 8) -> List[str]:
43
43
  """
44
- Extract key frames from a video file.
44
+ Extract key frames from a video file and resize them to fit OpenAI Vision API limits.
45
45
 
46
46
  Args:
47
47
  video_path: Path to the video file
48
48
  num_frames: Number of key frames to extract
49
49
 
50
50
  Returns:
51
- List of base64-encoded frame images
51
+ List of base64-encoded frame images (resized to fit 768px x 2000px limits)
52
52
 
53
53
  Raises:
54
54
  ImportError: If opencv-python is not installed
@@ -61,6 +61,10 @@ def _extract_key_frames(video_path: Path, num_frames: int = 8) -> List[str]:
61
61
  "opencv-python is required for video frame extraction. " "Please install it with: pip install opencv-python",
62
62
  )
63
63
 
64
+ # OpenAI Vision API limits for images (same as understand_image)
65
+ max_short_side = 768 # Maximum pixels for short side
66
+ max_long_side = 2000 # Maximum pixels for long side
67
+
64
68
  # Open the video file
65
69
  video = cv2.VideoCapture(str(video_path))
66
70
 
@@ -96,8 +100,26 @@ def _extract_key_frames(video_path: Path, num_frames: int = 8) -> List[str]:
96
100
  if not ret:
97
101
  continue
98
102
 
99
- # Encode frame to JPEG
100
- ret, buffer = cv2.imencode(".jpg", frame)
103
+ # Check and resize frame if needed to fit OpenAI Vision API limits
104
+ height, width = frame.shape[:2]
105
+ short_side = min(width, height)
106
+ long_side = max(width, height)
107
+
108
+ if short_side > max_short_side or long_side > max_long_side:
109
+ # Calculate scale factor to fit within dimension constraints
110
+ short_scale = max_short_side / short_side if short_side > max_short_side else 1.0
111
+ long_scale = max_long_side / long_side if long_side > max_long_side else 1.0
112
+ scale_factor = min(short_scale, long_scale) * 0.95 # 0.95 for safety margin
113
+
114
+ new_width = int(width * scale_factor)
115
+ new_height = int(height * scale_factor)
116
+
117
+ # Resize frame using LANCZOS (high quality)
118
+ frame = cv2.resize(frame, (new_width, new_height), interpolation=cv2.INTER_LANCZOS4)
119
+
120
+ # Encode frame to JPEG with quality=85 (same as understand_image)
121
+ encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 85]
122
+ ret, buffer = cv2.imencode(".jpg", frame, encode_param)
101
123
 
102
124
  if not ret:
103
125
  continue
@@ -122,6 +144,7 @@ async def understand_video(
122
144
  num_frames: int = 8,
123
145
  model: str = "gpt-4.1",
124
146
  allowed_paths: Optional[List[str]] = None,
147
+ agent_cwd: Optional[str] = None,
125
148
  ) -> ExecutionResult:
126
149
  """
127
150
  Understand and analyze a video by extracting key frames and using OpenAI's gpt-4.1 API.
@@ -139,6 +162,7 @@ async def understand_video(
139
162
  - Recommended range: 4-16 frames
140
163
  model: Model to use (default: "gpt-4.1")
141
164
  allowed_paths: List of allowed base paths for validation (optional)
165
+ agent_cwd: Agent's current working directory (automatically injected, optional)
142
166
 
143
167
  Returns:
144
168
  ExecutionResult containing:
@@ -201,10 +225,13 @@ async def understand_video(
201
225
  client = OpenAI(api_key=openai_api_key)
202
226
 
203
227
  # Resolve video path
228
+ # Use agent_cwd if available, otherwise fall back to Path.cwd()
229
+ base_dir = Path(agent_cwd) if agent_cwd else Path.cwd()
230
+
204
231
  if Path(video_path).is_absolute():
205
232
  vid_path = Path(video_path).resolve()
206
233
  else:
207
- vid_path = (Path.cwd() / video_path).resolve()
234
+ vid_path = (base_dir / video_path).resolve()
208
235
 
209
236
  # Validate video path
210
237
  _validate_path_access(vid_path, allowed_paths_list)