massgen 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff compares the contents of two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
Files changed (77)
  1. massgen/__init__.py +1 -1
  2. massgen/agent_config.py +33 -7
  3. massgen/api_params_handler/_api_params_handler_base.py +3 -0
  4. massgen/api_params_handler/_chat_completions_api_params_handler.py +7 -1
  5. massgen/backend/azure_openai.py +9 -1
  6. massgen/backend/base.py +56 -0
  7. massgen/backend/base_with_custom_tool_and_mcp.py +4 -4
  8. massgen/backend/capabilities.py +6 -6
  9. massgen/backend/chat_completions.py +18 -11
  10. massgen/backend/claude_code.py +9 -1
  11. massgen/backend/gemini.py +71 -6
  12. massgen/backend/gemini_utils.py +30 -0
  13. massgen/backend/grok.py +39 -6
  14. massgen/backend/response.py +18 -11
  15. massgen/chat_agent.py +9 -3
  16. massgen/cli.py +319 -43
  17. massgen/config_builder.py +163 -18
  18. massgen/configs/README.md +78 -20
  19. massgen/configs/basic/multi/three_agents_default.yaml +2 -2
  20. massgen/configs/debug/restart_test_controlled.yaml +60 -0
  21. massgen/configs/debug/restart_test_controlled_filesystem.yaml +73 -0
  22. massgen/configs/tools/code-execution/docker_with_sudo.yaml +35 -0
  23. massgen/configs/tools/custom_tools/computer_use_browser_example.yaml +56 -0
  24. massgen/configs/tools/custom_tools/computer_use_docker_example.yaml +65 -0
  25. massgen/configs/tools/custom_tools/computer_use_example.yaml +50 -0
  26. massgen/configs/tools/custom_tools/crawl4ai_mcp_example.yaml +67 -0
  27. massgen/configs/tools/custom_tools/crawl4ai_multi_agent_example.yaml +68 -0
  28. massgen/configs/tools/custom_tools/multimodal_tools/playwright_with_img_understanding.yaml +98 -0
  29. massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +33 -0
  30. massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +34 -0
  31. massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +33 -0
  32. massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +34 -0
  33. massgen/configs/tools/custom_tools/multimodal_tools/understand_video_example.yaml +54 -0
  34. massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +59 -0
  35. massgen/configs/tools/memory/README.md +199 -0
  36. massgen/configs/tools/memory/gpt5mini_gemini_context_window_management.yaml +131 -0
  37. massgen/configs/tools/memory/gpt5mini_gemini_no_persistent_memory.yaml +133 -0
  38. massgen/configs/tools/memory/test_context_window_management.py +286 -0
  39. massgen/configs/tools/multimodal/gpt5mini_gpt5nano_documentation_evolution.yaml +97 -0
  40. massgen/configs/tools/planning/five_agents_discord_mcp_planning_mode.yaml +7 -29
  41. massgen/configs/tools/planning/five_agents_filesystem_mcp_planning_mode.yaml +5 -6
  42. massgen/configs/tools/planning/five_agents_notion_mcp_planning_mode.yaml +4 -4
  43. massgen/configs/tools/planning/five_agents_twitter_mcp_planning_mode.yaml +4 -4
  44. massgen/configs/tools/planning/gpt5_mini_case_study_mcp_planning_mode.yaml +2 -2
  45. massgen/docker/README.md +83 -0
  46. massgen/filesystem_manager/_code_execution_server.py +22 -7
  47. massgen/filesystem_manager/_docker_manager.py +21 -1
  48. massgen/filesystem_manager/_filesystem_manager.py +8 -0
  49. massgen/filesystem_manager/_workspace_tools_server.py +0 -997
  50. massgen/formatter/_gemini_formatter.py +73 -0
  51. massgen/frontend/coordination_ui.py +175 -257
  52. massgen/frontend/displays/base_display.py +29 -0
  53. massgen/frontend/displays/rich_terminal_display.py +155 -9
  54. massgen/frontend/displays/simple_display.py +21 -0
  55. massgen/frontend/displays/terminal_display.py +22 -2
  56. massgen/logger_config.py +50 -6
  57. massgen/message_templates.py +123 -3
  58. massgen/orchestrator.py +652 -44
  59. massgen/tests/test_code_execution.py +178 -0
  60. massgen/tests/test_intelligent_planning_mode.py +643 -0
  61. massgen/tests/test_orchestration_restart.py +204 -0
  62. massgen/token_manager/token_manager.py +13 -4
  63. massgen/tool/__init__.py +4 -0
  64. massgen/tool/_multimodal_tools/understand_audio.py +193 -0
  65. massgen/tool/_multimodal_tools/understand_file.py +550 -0
  66. massgen/tool/_multimodal_tools/understand_image.py +212 -0
  67. massgen/tool/_multimodal_tools/understand_video.py +313 -0
  68. massgen/tool/docs/multimodal_tools.md +779 -0
  69. massgen/tool/workflow_toolkits/__init__.py +26 -0
  70. massgen/tool/workflow_toolkits/post_evaluation.py +216 -0
  71. massgen/utils.py +1 -0
  72. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/METADATA +57 -52
  73. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/RECORD +77 -49
  74. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/WHEEL +0 -0
  75. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/entry_points.txt +0 -0
  76. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/licenses/LICENSE +0 -0
  77. {massgen-0.1.1.dist-info → massgen-0.1.3.dist-info}/top_level.txt +0 -0
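
One of the largest changes in this release is the removal of roughly a thousand lines of OpenAI-backed media-generation tools from massgen/filesystem_manager/_workspace_tools_server.py, shown in the diff below. The file list above shows matching additions under massgen/tool/_multimodal_tools/ (understand_audio.py, understand_file.py, understand_image.py, understand_video.py), which suggests the multimodal functionality was relocated rather than dropped. For orientation, here is a minimal sketch of the fastmcp registration pattern the removed server code uses; the server name and the placeholder tool are hypothetical, and only the create_server()/@mcp.tool() shape and the result-dict convention come from the diff itself:

    # Minimal sketch (not MassGen's actual code) of the fastmcp
    # registration pattern used by _workspace_tools_server.py below.
    from typing import Any, Dict

    import fastmcp


    async def create_server() -> fastmcp.FastMCP:
        mcp = fastmcp.FastMCP("workspace-tools")  # hypothetical server name

        @mcp.tool()
        def example_tool(text: str) -> Dict[str, Any]:
            """Placeholder tool; the real tools below return the same
            {"success": ..., "operation": ..., ...} dict shape."""
            return {"success": True, "operation": "example_tool", "text": text}

        return mcp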
massgen/filesystem_manager/_workspace_tools_server.py
@@ -21,18 +21,14 @@ Tools provided:
  """
 
  import argparse
- import base64
  import difflib
  import filecmp
  import fnmatch
- import os
  import shutil
  from pathlib import Path
  from typing import Any, Dict, List, Optional, Tuple
 
  import fastmcp
- from dotenv import load_dotenv
- from openai import OpenAI
 
 
  def get_copy_file_pairs(
@@ -819,997 +815,4 @@ async def create_server() -> fastmcp.FastMCP:
          except Exception as e:
              return {"success": False, "operation": "compare_files", "error": str(e)}
 
-     @mcp.tool()
-     def generate_and_store_image_with_input_images(
-         base_image_paths: List[str],
-         prompt: str = "Create a variation of the provided images",
-         model: str = "gpt-4.1",
-         n: int = 1,
-         storage_path: Optional[str] = None,
-     ) -> Dict[str, Any]:
-         """
-         Create variations based on multiple input images using OpenAI's gpt-4.1 API.
-
-         This tool generates image variations based on multiple base images using OpenAI's gpt-4.1 API
-         and saves them to the workspace with automatic organization.
-
-         Args:
-             base_image_paths: List of paths to base images (PNG/JPEG files, less than 4MB)
-                 - Relative path: Resolved relative to workspace
-                 - Absolute path: Must be within allowed directories
-             prompt: Text description for the variation (default: "Create a variation of the provided images")
-             model: Model to use (default: "gpt-4.1")
-             n: Number of variations to generate (default: 1)
-             storage_path: Directory path where to save variations (optional)
-                 - Relative path: Resolved relative to workspace
-                 - Absolute path: Must be within allowed directories
-                 - None/empty: Saves to workspace root
-
-         Returns:
-             Dictionary containing:
-             - success: Whether operation succeeded
-             - operation: "generate_and_store_image_with_input_images"
-             - note: Note about usage
-             - images: List of generated images with file paths and metadata
-             - model: Model used for generation
-             - prompt: The prompt used
-             - total_images: Total number of images generated
-
-         Examples:
-             generate_and_store_image_with_input_images(["cat.png", "dog.png"], "Combine these animals")
-             → Generates a variation combining both images
-
-             generate_and_store_image_with_input_images(["art/logo.png", "art/icon.png"], "Create a unified design")
-             → Generates variations based on both images
-
-         Security:
-             - Requires valid OpenAI API key
-             - Input images must be valid image files less than 4MB
-             - Files are saved to specified path within workspace
-         """
-         from datetime import datetime
-
-         try:
-             # Load environment variables
-             script_dir = Path(__file__).parent.parent.parent
-             env_path = script_dir / ".env"
-             if env_path.exists():
-                 load_dotenv(env_path)
-             else:
-                 load_dotenv()
-
-             openai_api_key = os.getenv("OPENAI_API_KEY")
-
-             if not openai_api_key:
-                 return {
-                     "success": False,
-                     "operation": "generate_and_store_image_with_input_images",
-                     "error": "OpenAI API key not found. Please set OPENAI_API_KEY in .env file or environment variable.",
-                 }
-
-             # Initialize OpenAI client
-             client = OpenAI(api_key=openai_api_key)
-
-             # Prepare content list with prompt and images
-             content = [{"type": "input_text", "text": prompt}]
-
-             # Process and validate all input images
-             validated_paths = []
-             for image_path_str in base_image_paths:
-                 # Resolve image path
-                 if Path(image_path_str).is_absolute():
-                     image_path = Path(image_path_str).resolve()
-                 else:
-                     image_path = (Path.cwd() / image_path_str).resolve()
-
-                 # Validate image path
-                 _validate_path_access(image_path, mcp.allowed_paths)
-
-                 if not image_path.exists():
-                     return {
-                         "success": False,
-                         "operation": "generate_and_store_image_with_input_images",
-                         "error": f"Image file does not exist: {image_path}",
-                     }
-
-                 # Allow both PNG and JPEG formats
-                 if image_path.suffix.lower() not in [".png", ".jpg", ".jpeg"]:
-                     return {
-                         "success": False,
-                         "operation": "generate_and_store_image_with_input_images",
-                         "error": f"Image must be PNG or JPEG format: {image_path}",
-                     }
-
-                 # Check file size (must be less than 4MB)
-                 file_size = image_path.stat().st_size
-                 if file_size > 4 * 1024 * 1024:
-                     return {
-                         "success": False,
-                         "operation": "generate_and_store_image_with_input_images",
-                         "error": f"Image file too large (must be < 4MB): {image_path} is {file_size / (1024*1024):.2f}MB",
-                     }
-
-                 validated_paths.append(image_path)
-
-                 # Read and encode image to base64
-                 with open(image_path, "rb") as f:
-                     image_data = f.read()
-                 image_base64 = base64.b64encode(image_data).decode("utf-8")
-
-                 # Determine MIME type
-                 mime_type = "image/jpeg" if image_path.suffix.lower() in [".jpg", ".jpeg"] else "image/png"
-
-                 # Add image to content
-                 content.append(
-                     {
-                         "type": "input_image",
-                         "image_url": f"data:{mime_type};base64,{image_base64}",
-                     },
-                 )
-
-             # Determine storage directory
-             if storage_path:
-                 if Path(storage_path).is_absolute():
-                     storage_dir = Path(storage_path).resolve()
-                 else:
-                     storage_dir = (Path.cwd() / storage_path).resolve()
-             else:
-                 storage_dir = Path.cwd()
-
-             # Validate storage directory
-             _validate_path_access(storage_dir, mcp.allowed_paths)
-             storage_dir.mkdir(parents=True, exist_ok=True)
-
-             try:
-                 # print("Content for OpenAI API:", str(content))
-                 # Generate variations using gpt-4.1 API with all images at once
-                 # append content to a file
-                 response = client.responses.create(
-                     model=model,
-                     input=[
-                         {
-                             "role": "user",
-                             "content": content,
-                         },
-                     ],
-                     tools=[{"type": "image_generation"}],
-                 )
-
-                 # Extract image generation calls from response
-                 image_generation_calls = [output for output in response.output if output.type == "image_generation_call"]
-
-                 all_variations = []
-                 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-
-                 # Process generated images
-                 for idx, output in enumerate(image_generation_calls):
-                     if hasattr(output, "result"):
-                         image_base64 = output.result
-                         image_bytes = base64.b64decode(image_base64)
-
-                         # Generate filename
-                         if len(image_generation_calls) > 1:
-                             filename = f"variation_{idx+1}_{timestamp}.png"
-                         else:
-                             filename = f"variation_{timestamp}.png"
-
-                         # Full file path
-                         file_path = storage_dir / filename
-
-                         # Save image
-                         file_path.write_bytes(image_bytes)
-
-                         all_variations.append(
-                             {
-                                 "source_images": [str(p) for p in validated_paths],
-                                 "file_path": str(file_path),
-                                 "filename": filename,
-                                 "size": len(image_bytes),
-                                 "index": idx,
-                             },
-                         )
-
-                 # If no images were generated, check for text response
-                 if not all_variations:
-                     text_outputs = [output.content for output in response.output if hasattr(output, "content")]
-                     if text_outputs:
-                         return {
-                             "success": False,
-                             "operation": "generate_and_store_image_with_input_images",
-                             "error": f"No images generated. Response: {' '.join(text_outputs)}",
-                         }
-
-             except Exception as api_error:
-                 return {
-                     "success": False,
-                     "operation": "generate_and_store_image_with_input_images",
-                     "error": f"OpenAI API error: {str(api_error)}",
-                 }
-
-             return {
-                 "success": True,
-                 "operation": "generate_and_store_image_with_input_images",
-                 "note": "If no input images were provided, you must use generate_and_store_image_no_input_images tool.",
-                 "images": all_variations,
-                 "model": model,
-                 "prompt": prompt,
-                 "total_images": len(all_variations),
-             }
-
-         except Exception as e:
-             return {
-                 "success": False,
-                 "operation": "generate_and_store_image_with_input_images",
-                 "error": f"Failed to generate variations: {str(e)}",
-             }
-
-     @mcp.tool()
-     def generate_and_store_audio_no_input_audios(
-         prompt: str,
-         model: str = "gpt-4o-audio-preview",
-         voice: str = "alloy",
-         audio_format: str = "wav",
-         storage_path: Optional[str] = None,
-     ) -> Dict[str, Any]:
-         """
-         Generate audio from text using OpenAI's gpt-4o-audio-preview model and store it in the workspace.
-
-         This tool generates audio speech from text prompts using OpenAI's audio generation API
-         and saves the audio files to the workspace with automatic organization.
-
-         Args:
-             prompt: Text content to convert to audio speech
-             model: Model to use for generation (default: "gpt-4o-audio-preview")
-             voice: Voice to use for audio generation (default: "alloy")
-                 Options: "alloy", "echo", "fable", "onyx", "nova", "shimmer"
-             audio_format: Audio format for output (default: "wav")
-                 Options: "wav", "mp3", "opus", "aac", "flac"
-             storage_path: Directory path where to save the audio (optional)
-                 - Relative path: Resolved relative to workspace (e.g., "audio/generated")
-                 - Absolute path: Must be within allowed directories
-                 - None/empty: Saves to workspace root
-
-         Returns:
-             Dictionary containing:
-             - success: Whether operation succeeded
-             - operation: "generate_and_store_audio_no_input_audios"
-             - audio_file: Generated audio file with path and metadata
-             - model: Model used for generation
-             - prompt: The prompt used for generation
-             - voice: Voice used for generation
-             - format: Audio format used
-
-         Examples:
-             generate_and_store_audio_no_input_audios("Is a golden retriever a good family dog?")
-             → Generates and saves to: 20240115_143022_audio.wav
-
-             generate_and_store_audio_no_input_audios("Hello world", voice="nova", audio_format="mp3")
-             → Generates with nova voice and saves as: 20240115_143022_audio.mp3
-
-         Security:
-             - Requires valid OpenAI API key (automatically detected from .env or environment)
-             - Files are saved to specified path within workspace
-             - Path must be within allowed directories
-         """
-         from datetime import datetime
-
-         try:
-             # Load environment variables
-             script_dir = Path(__file__).parent.parent.parent
-             env_path = script_dir / ".env"
-             if env_path.exists():
-                 load_dotenv(env_path)
-             else:
-                 load_dotenv()
-
-             openai_api_key = os.getenv("OPENAI_API_KEY")
-
-             if not openai_api_key:
-                 return {
-                     "success": False,
-                     "operation": "generate_and_store_audio_no_input_audios",
-                     "error": "OpenAI API key not found. Please set OPENAI_API_KEY in .env file or environment variable.",
-                 }
-
-             # Initialize OpenAI client
-             client = OpenAI(api_key=openai_api_key)
-
-             # Determine storage directory
-             if storage_path:
-                 if Path(storage_path).is_absolute():
-                     storage_dir = Path(storage_path).resolve()
-                 else:
-                     storage_dir = (Path.cwd() / storage_path).resolve()
-             else:
-                 storage_dir = Path.cwd()
-
-             # Validate storage directory is within allowed paths
-             _validate_path_access(storage_dir, mcp.allowed_paths)
-
-             # Create directory if it doesn't exist
-             storage_dir.mkdir(parents=True, exist_ok=True)
-
-             try:
-                 # Generate audio using OpenAI API
-                 completion = client.chat.completions.create(
-                     model=model,
-                     modalities=["text", "audio"],
-                     audio={"voice": voice, "format": audio_format},
-                     messages=[
-                         {
-                             "role": "user",
-                             "content": prompt,
-                         },
-                     ],
-                 )
-
-                 # Check if audio data is available
-                 if not completion.choices[0].message.audio or not completion.choices[0].message.audio.data:
-                     return {
-                         "success": False,
-                         "operation": "generate_and_store_audio_no_input_audios",
-                         "error": "No audio data received from API",
-                     }
-
-                 # Decode audio data from base64
-                 audio_bytes = base64.b64decode(completion.choices[0].message.audio.data)
-
-                 # Generate filename with timestamp
-                 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-
-                 # Clean prompt for filename (first 30 chars)
-                 clean_prompt = "".join(c for c in prompt[:30] if c.isalnum() or c in (" ", "-", "_")).strip()
-                 clean_prompt = clean_prompt.replace(" ", "_")
-
-                 filename = f"{timestamp}_{clean_prompt}.{audio_format}"
-
-                 # Full file path
-                 file_path = storage_dir / filename
-
-                 # Write audio to file
-                 file_path.write_bytes(audio_bytes)
-                 file_size = len(audio_bytes)
-
-                 # Get text response if available
-                 text_response = completion.choices[0].message.content if completion.choices[0].message.content else None
-
-                 return {
-                     "success": True,
-                     "operation": "generate_and_store_audio_no_input_audios",
-                     "audio_file": {
-                         "file_path": str(file_path),
-                         "filename": filename,
-                         "size": file_size,
-                         "format": audio_format,
-                     },
-                     "model": model,
-                     "prompt": prompt,
-                     "voice": voice,
-                     "format": audio_format,
-                     "text_response": text_response,
-                 }
-
-             except Exception as api_error:
-                 return {
-                     "success": False,
-                     "operation": "generate_and_store_audio_no_input_audios",
-                     "error": f"OpenAI API error: {str(api_error)}",
-                 }
-
-         except Exception as e:
-             return {
-                 "success": False,
-                 "operation": "generate_and_store_audio_no_input_audios",
-                 "error": f"Failed to generate or save audio: {str(e)}",
-             }
-
-     @mcp.tool()
-     def generate_and_store_image_no_input_images(
-         prompt: str,
-         model: str = "gpt-4.1",
-         storage_path: Optional[str] = None,
-     ) -> Dict[str, Any]:
-         """
-         Generate image using OpenAI's response with gpt-4.1 **WITHOUT ANY INPUT IMAGES** and store it in the workspace.
-
-         This tool Generate image using OpenAI's response with gpt-4.1 **WITHOUT ANY INPUT IMAGES** and store it in the workspace.
-
-         Args:
-             prompt: Text description of the image to generate
-             model: Model to use for generation (default: "gpt-4.1")
-                 Options: "gpt-4.1"
-             n: Number of images to generate (default: 1)
-                 - gpt-4.1: only 1
-             storage_path: Directory path where to save the image (optional)
-                 - Relative path: Resolved relative to workspace (e.g., "images/generated")
-                 - Absolute path: Must be within allowed directories
-                 - None/empty: Saves to workspace root
-
-         Returns:
-             Dictionary containing:
-             - success: Whether operation succeeded
-             - operation: "generate_and_store_image_no_input_images"
-             - note: Note about operation
-             - images: List of generated images with file paths and metadata
-             - model: Model used for generation
-             - prompt: The prompt used for generation
-             - total_images: Total number of images generated and saved
-             - images: List of generated images with file paths and metadata
-
-         Examples:
-             generate_and_store_image_no_input_images("a cat in space")
-             → Generates and saves to: 20240115_143022_a_cat_in_space.png
-
-             generate_and_store_image_no_input_images("sunset over mountains", storage_path="art/landscapes")
-             → Generates and saves to: art/landscapes/20240115_143022_sunset_over_mountains.png
-
-         Security:
-             - Requires valid OpenAI API key (automatically detected from .env or environment)
-             - Files are saved to specified path within workspace
-             - Path must be within allowed directories
-
-         Note:
-             API key is automatically detected in this order:
-             1. First checks .env file in current directory or parent directories
-             2. Then checks environment variables
-         """
-         from datetime import datetime
-
-         try:
-             # Try to find and load .env file from multiple locations
-             # 1. Try loading from script directory
-             script_dir = Path(__file__).parent.parent.parent  # Go up to project root
-             env_path = script_dir / ".env"
-             if env_path.exists():
-                 load_dotenv(env_path)
-             else:
-                 # 2. Try loading from current directory and parent directories
-                 load_dotenv()
-
-             # Get API key from environment (load_dotenv will have loaded .env file)
-             openai_api_key = os.getenv("OPENAI_API_KEY")
-
-             if not openai_api_key:
-                 return {
-                     "success": False,
-                     "operation": "generate_and_store_image",
-                     "error": "OpenAI API key not found. Please set OPENAI_API_KEY in .env file or environment variable.",
-                 }
-
-             # Initialize OpenAI client
-             client = OpenAI(api_key=openai_api_key)
-
-             # Determine storage directory
-             if storage_path:
-                 if Path(storage_path).is_absolute():
-                     storage_dir = Path(storage_path).resolve()
-                 else:
-                     storage_dir = (Path.cwd() / storage_path).resolve()
-             else:
-                 storage_dir = Path.cwd()
-
-             # Validate storage directory is within allowed paths
-             _validate_path_access(storage_dir, mcp.allowed_paths)
-
-             # Create directory if it doesn't exist
-             storage_dir.mkdir(parents=True, exist_ok=True)
-
-             try:
-                 # Generate image using OpenAI API with gpt-4.1 non-streaming format
-                 response = client.responses.create(
-                     model=model,
-                     input=prompt,
-                     tools=[{"type": "image_generation"}],
-                 )
-
-                 # Extract image data from response
-                 image_data = [output.result for output in response.output if output.type == "image_generation_call"]
-
-                 saved_images = []
-
-                 if image_data:
-                     # Generate filename with timestamp
-                     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-
-                     # Clean prompt for filename
-                     clean_prompt = "".join(c for c in prompt[:30] if c.isalnum() or c in (" ", "-", "_")).strip()
-                     clean_prompt = clean_prompt.replace(" ", "_")
-
-                     for idx, image_base64 in enumerate(image_data):
-                         # Decode base64 image data
-                         image_bytes = base64.b64decode(image_base64)
-
-                         # Add index if generating multiple images
-                         if len(image_data) > 1:
-                             filename = f"{timestamp}_{clean_prompt}_{idx+1}.png"
-                         else:
-                             filename = f"{timestamp}_{clean_prompt}.png"
-
-                         # Full file path
-                         file_path = storage_dir / filename
-
-                         # Write image to file
-                         file_path.write_bytes(image_bytes)
-                         file_size = len(image_bytes)
-
-                         saved_images.append(
-                             {
-                                 "file_path": str(file_path),
-                                 "filename": filename,
-                                 "size": file_size,
-                                 "index": idx,
-                             },
-                         )
-
-                 result = {
-                     "success": True,
-                     "operation": "generate_and_store_image_no_input_images",
-                     "note": "New images are generated and saved to the specified path.",
-                     "images": saved_images,
-                     "model": model,
-                     "prompt": prompt,
-                     "total_images": len(saved_images),
-                 }
-
-                 return result
-
-             except Exception as api_error:
-                 print(f"OpenAI API error: {str(api_error)}")
-                 return {
-                     "success": False,
-                     "operation": "generate_and_store_image_no_input_images",
-                     "error": f"OpenAI API error: {str(api_error)}",
-                 }
-
-         except Exception as e:
-             return {
-                 "success": False,
-                 "operation": "generate_and_store_image_no_input_images",
-                 "error": f"Failed to generate or save image: {str(e)}",
-             }
-
-     @mcp.tool()
-     def generate_text_with_input_audio(
-         audio_paths: List[str],
-         model: str = "gpt-4o-transcribe",
-     ) -> Dict[str, Any]:
-         """
-         Transcribe audio file(s) to text using OpenAI's Transcription API.
-
-         This tool processes one or more audio files through OpenAI's Transcription API
-         to extract the text content from the audio. Each file is processed separately.
-
-         Args:
-             audio_paths: List of paths to input audio files (WAV, MP3, M4A, etc.)
-                 - Relative path: Resolved relative to workspace
-                 - Absolute path: Must be within allowed directories
-             model: Model to use (default: "gpt-4o-transcribe")
-
-         Returns:
-             Dictionary containing:
-             - success: Whether operation succeeded
-             - operation: "generate_text_with_input_audio"
-             - transcriptions: List of transcription results for each file
-             - audio_files: List of paths to the input audio files
-             - model: Model used
-
-         Examples:
-             generate_text_with_input_audio(["recording.wav"])
-             → Returns transcription for recording.wav
-
-             generate_text_with_input_audio(["interview1.mp3", "interview2.mp3"])
-             → Returns separate transcriptions for each file
-
-         Security:
-             - Requires valid OpenAI API key
-             - All input audio files must exist and be readable
-         """
-         try:
-             # Load environment variables
-             script_dir = Path(__file__).parent.parent.parent
-             env_path = script_dir / ".env"
-             if env_path.exists():
-                 load_dotenv(env_path)
-             else:
-                 load_dotenv()
-
-             openai_api_key = os.getenv("OPENAI_API_KEY")
-
-             if not openai_api_key:
-                 return {
-                     "success": False,
-                     "operation": "generate_text_with_input_audio",
-                     "error": "OpenAI API key not found. Please set OPENAI_API_KEY in .env file or environment variable.",
-                 }
-
-             # Initialize OpenAI client
-             client = OpenAI(api_key=openai_api_key)
-
-             # Validate and process input audio files
-             validated_audio_paths = []
-             audio_extensions = [".wav", ".mp3", ".m4a", ".mp4", ".ogg", ".flac", ".aac", ".wma", ".opus"]
-
-             for audio_path_str in audio_paths:
-                 # Resolve audio path
-                 if Path(audio_path_str).is_absolute():
-                     audio_path = Path(audio_path_str).resolve()
-                 else:
-                     audio_path = (Path.cwd() / audio_path_str).resolve()
-
-                 # Validate audio path
-                 _validate_path_access(audio_path, mcp.allowed_paths)
-
-                 if not audio_path.exists():
-                     return {
-                         "success": False,
-                         "operation": "generate_text_with_input_audio",
-                         "error": f"Audio file does not exist: {audio_path}",
-                     }
-
-                 # Check if file is an audio file
-                 if audio_path.suffix.lower() not in audio_extensions:
-                     return {
-                         "success": False,
-                         "operation": "generate_text_with_input_audio",
-                         "error": f"File does not appear to be an audio file: {audio_path}",
-                     }
-
-                 validated_audio_paths.append(audio_path)
-
-             # Process each audio file separately using OpenAI Transcription API
-             transcriptions = []
-
-             for audio_path in validated_audio_paths:
-                 try:
-                     # Open audio file
-                     with open(audio_path, "rb") as audio_file:
-                         # Basic transcription without prompt
-                         transcription = client.audio.transcriptions.create(
-                             model=model,
-                             file=audio_file,
-                             response_format="text",
-                         )
-
-                     # Add transcription to list
-                     transcriptions.append(
-                         {
-                             "file": str(audio_path),
-                             "transcription": transcription,
-                         },
-                     )
-
-                 except Exception as api_error:
-                     return {
-                         "success": False,
-                         "operation": "generate_text_with_input_audio",
-                         "error": f"Transcription API error for file {audio_path}: {str(api_error)}",
-                     }
-
-             return {
-                 "success": True,
-                 "operation": "generate_text_with_input_audio",
-                 "transcriptions": transcriptions,
-                 "audio_files": [str(p) for p in validated_audio_paths],
-                 "model": model,
-             }
-
-         except Exception as e:
-             return {
-                 "success": False,
-                 "operation": "generate_text_with_input_audio",
-                 "error": f"Failed to transcribe audio: {str(e)}",
-             }
-
-     @mcp.tool()
-     def convert_text_to_speech(
-         input_text: str,
-         model: str = "gpt-4o-mini-tts",
-         voice: str = "alloy",
-         instructions: Optional[str] = None,
-         storage_path: Optional[str] = None,
-         audio_format: str = "mp3",
-     ) -> Dict[str, Any]:
-         """
-         Convert text (transcription) directly to speech using OpenAI's TTS API with streaming response.
-
-         This tool converts text directly to speech audio using OpenAI's Text-to-Speech API,
-         designed specifically for converting transcriptions or any text content to spoken audio.
-         Uses streaming response for efficient file handling.
-
-         Args:
-             input_text: The text content to convert to speech (e.g., transcription text)
-             model: TTS model to use (default: "gpt-4o-mini-tts")
-                 Options: "gpt-4o-mini-tts", "tts-1", "tts-1-hd"
-             voice: Voice to use for speech synthesis (default: "alloy")
-                 Options: "alloy", "echo", "fable", "onyx", "nova", "shimmer", "coral", "sage"
-             instructions: Optional speaking instructions for tone and style (e.g., "Speak in a cheerful tone")
-             storage_path: Directory path where to save the audio file (optional)
-                 - Relative path: Resolved relative to workspace
-                 - Absolute path: Must be within allowed directories
-                 - None/empty: Saves to workspace root
-             audio_format: Output audio format (default: "mp3")
-                 Options: "mp3", "opus", "aac", "flac", "wav", "pcm"
-
-         Returns:
-             Dictionary containing:
-             - success: Whether operation succeeded
-             - operation: "convert_text_to_speech"
-             - audio_file: Generated audio file with path and metadata
-             - model: TTS model used
-             - voice: Voice used
-             - format: Audio format used
-             - text_length: Length of input text
-             - instructions: Speaking instructions if provided
-
-         Examples:
-             convert_text_to_speech("Hello world, this is a test.")
-             → Converts text to speech and saves as MP3
-
-             convert_text_to_speech(
-                 "Today is a wonderful day to build something people love!",
-                 voice="coral",
-                 instructions="Speak in a cheerful and positive tone."
-             )
-             → Converts with specific voice and speaking instructions
-
-         Security:
-             - Requires valid OpenAI API key
-             - Files are saved to specified path within workspace
-             - Path must be within allowed directories
-         """
-         from datetime import datetime
-
-         try:
-             # Load environment variables
-             script_dir = Path(__file__).parent.parent.parent
-             env_path = script_dir / ".env"
-             if env_path.exists():
-                 load_dotenv(env_path)
-             else:
-                 load_dotenv()
-
-             openai_api_key = os.getenv("OPENAI_API_KEY")
-
-             if not openai_api_key:
-                 return {
-                     "success": False,
-                     "operation": "convert_text_to_speech",
-                     "error": "OpenAI API key not found. Please set OPENAI_API_KEY in .env file or environment variable.",
-                 }
-
-             # Initialize OpenAI client
-             client = OpenAI(api_key=openai_api_key)
-
-             # Determine storage directory
-             if storage_path:
-                 if Path(storage_path).is_absolute():
-                     storage_dir = Path(storage_path).resolve()
-                 else:
-                     storage_dir = (Path.cwd() / storage_path).resolve()
-             else:
-                 storage_dir = Path.cwd()
-
-             # Validate storage directory is within allowed paths
-             _validate_path_access(storage_dir, mcp.allowed_paths)
-
-             # Create directory if it doesn't exist
-             storage_dir.mkdir(parents=True, exist_ok=True)
-
-             # Generate filename with timestamp
-             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-
-             # Clean text for filename (first 30 chars)
-             clean_text = "".join(c for c in input_text[:30] if c.isalnum() or c in (" ", "-", "_")).strip()
-             clean_text = clean_text.replace(" ", "_")
-
-             filename = f"speech_{timestamp}_{clean_text}.{audio_format}"
-             file_path = storage_dir / filename
-
-             try:
-                 # Prepare request parameters
-                 request_params = {
-                     "model": model,
-                     "voice": voice,
-                     "input": input_text,
-                 }
-
-                 # Add instructions if provided (only for models that support it)
-                 if instructions and model in ["gpt-4o-mini-tts"]:
-                     request_params["instructions"] = instructions
-
-                 # Use streaming response for efficient file handling
-                 with client.audio.speech.with_streaming_response.create(**request_params) as response:
-                     # Stream directly to file
-                     response.stream_to_file(file_path)
-
-                 # Get file size
-                 file_size = file_path.stat().st_size
-
-                 return {
-                     "success": True,
-                     "operation": "convert_text_to_speech",
-                     "audio_file": {
-                         "file_path": str(file_path),
-                         "filename": filename,
-                         "size": file_size,
-                         "format": audio_format,
-                     },
-                     "model": model,
-                     "voice": voice,
-                     "format": audio_format,
-                     "text_length": len(input_text),
-                     "instructions": instructions if instructions else None,
-                 }
-
-             except Exception as api_error:
-                 return {
-                     "success": False,
-                     "operation": "convert_text_to_speech",
-                     "error": f"OpenAI TTS API error: {str(api_error)}",
-                 }
-
-         except Exception as e:
-             return {
-                 "success": False,
-                 "operation": "convert_text_to_speech",
-                 "error": f"Failed to convert text to speech: {str(e)}",
-             }
-
-     @mcp.tool()
-     def generate_and_store_video_no_input_images(
-         prompt: str,
-         model: str = "sora-2",
-         seconds: int = 4,
-         storage_path: Optional[str] = None,
-     ) -> Dict[str, Any]:
-         """
-         Generate a video from a text prompt using OpenAI's Sora-2 API.
-
-         This tool generates a video based on a text prompt using OpenAI's Sora-2 API
-         and saves it to the workspace with automatic organization.
-
-         Args:
-             prompt: Text description for the video to generate
-             model: Model to use (default: "sora-2")
-             storage_path: Directory path where to save the video (optional)
-                 - Relative path: Resolved relative to workspace
-                 - Absolute path: Must be within allowed directories
-                 - None/empty: Saves to workspace root
-
-         Returns:
-             Dictionary containing:
-             - success: Whether operation succeeded
-             - operation: "generate_and_store_video_no_input_images"
-             - video_path: Path to the saved video file
-             - model: Model used for generation
-             - prompt: The prompt used
-             - duration: Time taken for generation in seconds
-
-         Examples:
-             generate_and_store_video_no_input_images("A cool cat on a motorcycle in the night")
-             → Generates a video and saves to workspace root
-
-             generate_and_store_video_no_input_images("Dancing robot", storage_path="videos/")
-             → Generates a video and saves to videos/ directory
-
-         Security:
-             - Requires valid OpenAI API key with Sora-2 access
-             - Files are saved to specified path within workspace
-         """
-         import time
-         from datetime import datetime
-
-         try:
-             # Load environment variables
-             script_dir = Path(__file__).parent.parent.parent
-             env_path = script_dir / ".env"
-             if env_path.exists():
-                 load_dotenv(env_path)
-             else:
-                 load_dotenv()
-
-             openai_api_key = os.getenv("OPENAI_API_KEY")
-
-             if not openai_api_key:
-                 return {
-                     "success": False,
-                     "operation": "generate_and_store_video_no_input_images",
-                     "error": "OpenAI API key not found. Please set OPENAI_API_KEY in .env file or environment variable.",
-                 }
-
-             # Initialize OpenAI client
-             client = OpenAI(api_key=openai_api_key)
-
-             # Determine storage directory
-             if storage_path:
-                 if Path(storage_path).is_absolute():
-                     storage_dir = Path(storage_path).resolve()
-                 else:
-                     storage_dir = (Path.cwd() / storage_path).resolve()
-             else:
-                 storage_dir = Path.cwd()
-
-             # Validate storage directory is within allowed paths
-             _validate_path_access(storage_dir, mcp.allowed_paths)
-
-             # Create directory if it doesn't exist
-             storage_dir.mkdir(parents=True, exist_ok=True)
-
-             try:
-                 start_time = time.time()
-
-                 # Start video generation (no print statements to avoid MCP JSON parsing issues)
-                 video = client.videos.create(
-                     model=model,
-                     prompt=prompt,
-                     seconds=str(seconds),
-                 )
-
-                 getattr(video, "progress", 0)
-
-                 # Monitor progress (silently, no stdout writes)
-                 while video.status in ("in_progress", "queued"):
-                     # Refresh status
-                     video = client.videos.retrieve(video.id)
-                     getattr(video, "progress", 0)
-                     time.sleep(2)
-
-                 if video.status == "failed":
-                     message = getattr(
-                         getattr(video, "error", None),
-                         "message",
-                         "Video generation failed",
-                     )
-                     return {
-                         "success": False,
-                         "operation": "generate_and_store_video_no_input_images",
-                         "error": message,
-                     }
-
-                 # Download video content
-                 content = client.videos.download_content(video.id, variant="video")
-
-                 # Generate filename with timestamp
-                 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-                 clean_prompt = "".join(c for c in prompt[:30] if c.isalnum() or c in (" ", "-", "_")).strip()
-                 clean_prompt = clean_prompt.replace(" ", "_")
-                 filename = f"{timestamp}_{clean_prompt}.mp4"
-
-                 # Full file path
-                 file_path = storage_dir / filename
-
-                 # Write video to file
-                 content.write_to_file(str(file_path))
-
-                 # Calculate duration
-                 duration = time.time() - start_time
-
-                 # Get file size
-                 file_size = file_path.stat().st_size
-
-                 return {
-                     "success": True,
-                     "operation": "generate_and_store_video_no_input_images",
-                     "video_path": str(file_path),
-                     "filename": filename,
-                     "size": file_size,
-                     "model": model,
-                     "prompt": prompt,
-                     "duration": duration,
-                 }
-
-             except Exception as api_error:
-                 return {
-                     "success": False,
-                     "operation": "generate_and_store_video_no_input_images",
-                     "error": f"OpenAI API error: {str(api_error)}",
-                 }
-
-         except Exception as e:
-             return {
-                 "success": False,
-                 "operation": "generate_and_store_video_no_input_images",
-                 "error": f"Failed to generate or save video: {str(e)}",
-             }
-
      return mcp
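
All of the removed generation tools follow the same handful of call patterns against the OpenAI Python SDK: the image tools use client.responses.create(...) with the built-in image_generation tool, the video tool polls client.videos with Sora-2, and the audio tools use chat completions with audio modalities or the dedicated transcription/TTS endpoints. A condensed, self-contained version of the image pattern, distilled from the removed code above (the prompt and output filename are illustrative; OPENAI_API_KEY must be set in the environment):

    # Condensed sketch of the image-generation pattern used by the removed
    # tools: ask a Responses API model to invoke the built-in
    # image_generation tool, then decode each base64 result.
    import base64

    from openai import OpenAI

    client = OpenAI()  # reads OPENAI_API_KEY from the environment

    response = client.responses.create(
        model="gpt-4.1",
        input="a cat in space",  # illustrative prompt
        tools=[{"type": "image_generation"}],
    )

    # Each generated image arrives base64-encoded on an
    # image_generation_call output item.
    for idx, output in enumerate(response.output):
        if output.type == "image_generation_call":
            with open(f"image_{idx}.png", "wb") as f:
                f.write(base64.b64decode(output.result))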