ag2 0.9.1a1__py3-none-any.whl → 0.9.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ag2 might be problematic. Click here for more details.
- {ag2-0.9.1a1.dist-info → ag2-0.9.2.dist-info}/METADATA +272 -75
- ag2-0.9.2.dist-info/RECORD +406 -0
- {ag2-0.9.1a1.dist-info → ag2-0.9.2.dist-info}/WHEEL +1 -2
- autogen/__init__.py +89 -0
- autogen/_website/__init__.py +3 -0
- autogen/_website/generate_api_references.py +427 -0
- autogen/_website/generate_mkdocs.py +1174 -0
- autogen/_website/notebook_processor.py +476 -0
- autogen/_website/process_notebooks.py +656 -0
- autogen/_website/utils.py +412 -0
- autogen/agentchat/__init__.py +44 -0
- autogen/agentchat/agent.py +182 -0
- autogen/agentchat/assistant_agent.py +85 -0
- autogen/agentchat/chat.py +309 -0
- autogen/agentchat/contrib/__init__.py +5 -0
- autogen/agentchat/contrib/agent_eval/README.md +7 -0
- autogen/agentchat/contrib/agent_eval/agent_eval.py +108 -0
- autogen/agentchat/contrib/agent_eval/criterion.py +43 -0
- autogen/agentchat/contrib/agent_eval/critic_agent.py +44 -0
- autogen/agentchat/contrib/agent_eval/quantifier_agent.py +39 -0
- autogen/agentchat/contrib/agent_eval/subcritic_agent.py +45 -0
- autogen/agentchat/contrib/agent_eval/task.py +42 -0
- autogen/agentchat/contrib/agent_optimizer.py +429 -0
- autogen/agentchat/contrib/capabilities/__init__.py +5 -0
- autogen/agentchat/contrib/capabilities/agent_capability.py +20 -0
- autogen/agentchat/contrib/capabilities/generate_images.py +301 -0
- autogen/agentchat/contrib/capabilities/teachability.py +393 -0
- autogen/agentchat/contrib/capabilities/text_compressors.py +66 -0
- autogen/agentchat/contrib/capabilities/tools_capability.py +22 -0
- autogen/agentchat/contrib/capabilities/transform_messages.py +93 -0
- autogen/agentchat/contrib/capabilities/transforms.py +566 -0
- autogen/agentchat/contrib/capabilities/transforms_util.py +122 -0
- autogen/agentchat/contrib/capabilities/vision_capability.py +214 -0
- autogen/agentchat/contrib/captainagent/__init__.py +9 -0
- autogen/agentchat/contrib/captainagent/agent_builder.py +790 -0
- autogen/agentchat/contrib/captainagent/captainagent.py +512 -0
- autogen/agentchat/contrib/captainagent/tool_retriever.py +335 -0
- autogen/agentchat/contrib/captainagent/tools/README.md +44 -0
- autogen/agentchat/contrib/captainagent/tools/__init__.py +5 -0
- autogen/agentchat/contrib/captainagent/tools/data_analysis/calculate_correlation.py +40 -0
- autogen/agentchat/contrib/captainagent/tools/data_analysis/calculate_skewness_and_kurtosis.py +28 -0
- autogen/agentchat/contrib/captainagent/tools/data_analysis/detect_outlier_iqr.py +28 -0
- autogen/agentchat/contrib/captainagent/tools/data_analysis/detect_outlier_zscore.py +28 -0
- autogen/agentchat/contrib/captainagent/tools/data_analysis/explore_csv.py +21 -0
- autogen/agentchat/contrib/captainagent/tools/data_analysis/shapiro_wilk_test.py +30 -0
- autogen/agentchat/contrib/captainagent/tools/information_retrieval/arxiv_download.py +27 -0
- autogen/agentchat/contrib/captainagent/tools/information_retrieval/arxiv_search.py +53 -0
- autogen/agentchat/contrib/captainagent/tools/information_retrieval/extract_pdf_image.py +53 -0
- autogen/agentchat/contrib/captainagent/tools/information_retrieval/extract_pdf_text.py +38 -0
- autogen/agentchat/contrib/captainagent/tools/information_retrieval/get_wikipedia_text.py +21 -0
- autogen/agentchat/contrib/captainagent/tools/information_retrieval/get_youtube_caption.py +34 -0
- autogen/agentchat/contrib/captainagent/tools/information_retrieval/image_qa.py +60 -0
- autogen/agentchat/contrib/captainagent/tools/information_retrieval/optical_character_recognition.py +61 -0
- autogen/agentchat/contrib/captainagent/tools/information_retrieval/perform_web_search.py +47 -0
- autogen/agentchat/contrib/captainagent/tools/information_retrieval/scrape_wikipedia_tables.py +33 -0
- autogen/agentchat/contrib/captainagent/tools/information_retrieval/transcribe_audio_file.py +21 -0
- autogen/agentchat/contrib/captainagent/tools/information_retrieval/youtube_download.py +35 -0
- autogen/agentchat/contrib/captainagent/tools/math/calculate_circle_area_from_diameter.py +21 -0
- autogen/agentchat/contrib/captainagent/tools/math/calculate_day_of_the_week.py +18 -0
- autogen/agentchat/contrib/captainagent/tools/math/calculate_fraction_sum.py +28 -0
- autogen/agentchat/contrib/captainagent/tools/math/calculate_matrix_power.py +31 -0
- autogen/agentchat/contrib/captainagent/tools/math/calculate_reflected_point.py +16 -0
- autogen/agentchat/contrib/captainagent/tools/math/complex_numbers_product.py +25 -0
- autogen/agentchat/contrib/captainagent/tools/math/compute_currency_conversion.py +23 -0
- autogen/agentchat/contrib/captainagent/tools/math/count_distinct_permutations.py +27 -0
- autogen/agentchat/contrib/captainagent/tools/math/evaluate_expression.py +28 -0
- autogen/agentchat/contrib/captainagent/tools/math/find_continuity_point.py +34 -0
- autogen/agentchat/contrib/captainagent/tools/math/fraction_to_mixed_numbers.py +39 -0
- autogen/agentchat/contrib/captainagent/tools/math/modular_inverse_sum.py +23 -0
- autogen/agentchat/contrib/captainagent/tools/math/simplify_mixed_numbers.py +36 -0
- autogen/agentchat/contrib/captainagent/tools/math/sum_of_digit_factorials.py +15 -0
- autogen/agentchat/contrib/captainagent/tools/math/sum_of_primes_below.py +15 -0
- autogen/agentchat/contrib/captainagent/tools/requirements.txt +10 -0
- autogen/agentchat/contrib/captainagent/tools/tool_description.tsv +34 -0
- autogen/agentchat/contrib/gpt_assistant_agent.py +526 -0
- autogen/agentchat/contrib/graph_rag/__init__.py +9 -0
- autogen/agentchat/contrib/graph_rag/document.py +29 -0
- autogen/agentchat/contrib/graph_rag/falkor_graph_query_engine.py +170 -0
- autogen/agentchat/contrib/graph_rag/falkor_graph_rag_capability.py +103 -0
- autogen/agentchat/contrib/graph_rag/graph_query_engine.py +53 -0
- autogen/agentchat/contrib/graph_rag/graph_rag_capability.py +63 -0
- autogen/agentchat/contrib/graph_rag/neo4j_graph_query_engine.py +268 -0
- autogen/agentchat/contrib/graph_rag/neo4j_graph_rag_capability.py +83 -0
- autogen/agentchat/contrib/graph_rag/neo4j_native_graph_query_engine.py +210 -0
- autogen/agentchat/contrib/graph_rag/neo4j_native_graph_rag_capability.py +93 -0
- autogen/agentchat/contrib/img_utils.py +397 -0
- autogen/agentchat/contrib/llamaindex_conversable_agent.py +117 -0
- autogen/agentchat/contrib/llava_agent.py +187 -0
- autogen/agentchat/contrib/math_user_proxy_agent.py +464 -0
- autogen/agentchat/contrib/multimodal_conversable_agent.py +125 -0
- autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py +324 -0
- autogen/agentchat/contrib/rag/__init__.py +10 -0
- autogen/agentchat/contrib/rag/chromadb_query_engine.py +272 -0
- autogen/agentchat/contrib/rag/llamaindex_query_engine.py +198 -0
- autogen/agentchat/contrib/rag/mongodb_query_engine.py +329 -0
- autogen/agentchat/contrib/rag/query_engine.py +74 -0
- autogen/agentchat/contrib/retrieve_assistant_agent.py +56 -0
- autogen/agentchat/contrib/retrieve_user_proxy_agent.py +703 -0
- autogen/agentchat/contrib/society_of_mind_agent.py +199 -0
- autogen/agentchat/contrib/swarm_agent.py +1425 -0
- autogen/agentchat/contrib/text_analyzer_agent.py +79 -0
- autogen/agentchat/contrib/vectordb/__init__.py +5 -0
- autogen/agentchat/contrib/vectordb/base.py +232 -0
- autogen/agentchat/contrib/vectordb/chromadb.py +315 -0
- autogen/agentchat/contrib/vectordb/couchbase.py +407 -0
- autogen/agentchat/contrib/vectordb/mongodb.py +550 -0
- autogen/agentchat/contrib/vectordb/pgvectordb.py +928 -0
- autogen/agentchat/contrib/vectordb/qdrant.py +320 -0
- autogen/agentchat/contrib/vectordb/utils.py +126 -0
- autogen/agentchat/contrib/web_surfer.py +303 -0
- autogen/agentchat/conversable_agent.py +4023 -0
- autogen/agentchat/group/__init__.py +64 -0
- autogen/agentchat/group/available_condition.py +91 -0
- autogen/agentchat/group/context_condition.py +77 -0
- autogen/agentchat/group/context_expression.py +238 -0
- autogen/agentchat/group/context_str.py +41 -0
- autogen/agentchat/group/context_variables.py +192 -0
- autogen/agentchat/group/group_tool_executor.py +202 -0
- autogen/agentchat/group/group_utils.py +591 -0
- autogen/agentchat/group/handoffs.py +244 -0
- autogen/agentchat/group/llm_condition.py +93 -0
- autogen/agentchat/group/multi_agent_chat.py +237 -0
- autogen/agentchat/group/on_condition.py +58 -0
- autogen/agentchat/group/on_context_condition.py +54 -0
- autogen/agentchat/group/patterns/__init__.py +18 -0
- autogen/agentchat/group/patterns/auto.py +159 -0
- autogen/agentchat/group/patterns/manual.py +176 -0
- autogen/agentchat/group/patterns/pattern.py +288 -0
- autogen/agentchat/group/patterns/random.py +106 -0
- autogen/agentchat/group/patterns/round_robin.py +117 -0
- autogen/agentchat/group/reply_result.py +26 -0
- autogen/agentchat/group/speaker_selection_result.py +41 -0
- autogen/agentchat/group/targets/__init__.py +4 -0
- autogen/agentchat/group/targets/group_chat_target.py +132 -0
- autogen/agentchat/group/targets/group_manager_target.py +151 -0
- autogen/agentchat/group/targets/transition_target.py +413 -0
- autogen/agentchat/group/targets/transition_utils.py +6 -0
- autogen/agentchat/groupchat.py +1694 -0
- autogen/agentchat/realtime/__init__.py +3 -0
- autogen/agentchat/realtime/experimental/__init__.py +20 -0
- autogen/agentchat/realtime/experimental/audio_adapters/__init__.py +8 -0
- autogen/agentchat/realtime/experimental/audio_adapters/twilio_audio_adapter.py +148 -0
- autogen/agentchat/realtime/experimental/audio_adapters/websocket_audio_adapter.py +139 -0
- autogen/agentchat/realtime/experimental/audio_observer.py +42 -0
- autogen/agentchat/realtime/experimental/clients/__init__.py +15 -0
- autogen/agentchat/realtime/experimental/clients/gemini/__init__.py +7 -0
- autogen/agentchat/realtime/experimental/clients/gemini/client.py +274 -0
- autogen/agentchat/realtime/experimental/clients/oai/__init__.py +8 -0
- autogen/agentchat/realtime/experimental/clients/oai/base_client.py +220 -0
- autogen/agentchat/realtime/experimental/clients/oai/rtc_client.py +243 -0
- autogen/agentchat/realtime/experimental/clients/oai/utils.py +48 -0
- autogen/agentchat/realtime/experimental/clients/realtime_client.py +190 -0
- autogen/agentchat/realtime/experimental/function_observer.py +85 -0
- autogen/agentchat/realtime/experimental/realtime_agent.py +158 -0
- autogen/agentchat/realtime/experimental/realtime_events.py +42 -0
- autogen/agentchat/realtime/experimental/realtime_observer.py +100 -0
- autogen/agentchat/realtime/experimental/realtime_swarm.py +475 -0
- autogen/agentchat/realtime/experimental/websockets.py +21 -0
- autogen/agentchat/realtime_agent/__init__.py +21 -0
- autogen/agentchat/user_proxy_agent.py +111 -0
- autogen/agentchat/utils.py +206 -0
- autogen/agents/__init__.py +3 -0
- autogen/agents/contrib/__init__.py +10 -0
- autogen/agents/contrib/time/__init__.py +8 -0
- autogen/agents/contrib/time/time_reply_agent.py +73 -0
- autogen/agents/contrib/time/time_tool_agent.py +51 -0
- autogen/agents/experimental/__init__.py +27 -0
- autogen/agents/experimental/deep_research/__init__.py +7 -0
- autogen/agents/experimental/deep_research/deep_research.py +52 -0
- autogen/agents/experimental/discord/__init__.py +7 -0
- autogen/agents/experimental/discord/discord.py +66 -0
- autogen/agents/experimental/document_agent/__init__.py +19 -0
- autogen/agents/experimental/document_agent/chroma_query_engine.py +316 -0
- autogen/agents/experimental/document_agent/docling_doc_ingest_agent.py +118 -0
- autogen/agents/experimental/document_agent/document_agent.py +461 -0
- autogen/agents/experimental/document_agent/document_conditions.py +50 -0
- autogen/agents/experimental/document_agent/document_utils.py +380 -0
- autogen/agents/experimental/document_agent/inmemory_query_engine.py +220 -0
- autogen/agents/experimental/document_agent/parser_utils.py +130 -0
- autogen/agents/experimental/document_agent/url_utils.py +426 -0
- autogen/agents/experimental/reasoning/__init__.py +7 -0
- autogen/agents/experimental/reasoning/reasoning_agent.py +1178 -0
- autogen/agents/experimental/slack/__init__.py +7 -0
- autogen/agents/experimental/slack/slack.py +73 -0
- autogen/agents/experimental/telegram/__init__.py +7 -0
- autogen/agents/experimental/telegram/telegram.py +77 -0
- autogen/agents/experimental/websurfer/__init__.py +7 -0
- autogen/agents/experimental/websurfer/websurfer.py +62 -0
- autogen/agents/experimental/wikipedia/__init__.py +7 -0
- autogen/agents/experimental/wikipedia/wikipedia.py +90 -0
- autogen/browser_utils.py +309 -0
- autogen/cache/__init__.py +10 -0
- autogen/cache/abstract_cache_base.py +75 -0
- autogen/cache/cache.py +203 -0
- autogen/cache/cache_factory.py +88 -0
- autogen/cache/cosmos_db_cache.py +144 -0
- autogen/cache/disk_cache.py +102 -0
- autogen/cache/in_memory_cache.py +58 -0
- autogen/cache/redis_cache.py +123 -0
- autogen/code_utils.py +596 -0
- autogen/coding/__init__.py +22 -0
- autogen/coding/base.py +119 -0
- autogen/coding/docker_commandline_code_executor.py +268 -0
- autogen/coding/factory.py +47 -0
- autogen/coding/func_with_reqs.py +202 -0
- autogen/coding/jupyter/__init__.py +23 -0
- autogen/coding/jupyter/base.py +36 -0
- autogen/coding/jupyter/docker_jupyter_server.py +167 -0
- autogen/coding/jupyter/embedded_ipython_code_executor.py +182 -0
- autogen/coding/jupyter/import_utils.py +82 -0
- autogen/coding/jupyter/jupyter_client.py +231 -0
- autogen/coding/jupyter/jupyter_code_executor.py +160 -0
- autogen/coding/jupyter/local_jupyter_server.py +172 -0
- autogen/coding/local_commandline_code_executor.py +405 -0
- autogen/coding/markdown_code_extractor.py +45 -0
- autogen/coding/utils.py +56 -0
- autogen/doc_utils.py +34 -0
- autogen/events/__init__.py +7 -0
- autogen/events/agent_events.py +1013 -0
- autogen/events/base_event.py +99 -0
- autogen/events/client_events.py +167 -0
- autogen/events/helpers.py +36 -0
- autogen/events/print_event.py +46 -0
- autogen/exception_utils.py +73 -0
- autogen/extensions/__init__.py +5 -0
- autogen/fast_depends/__init__.py +16 -0
- autogen/fast_depends/_compat.py +80 -0
- autogen/fast_depends/core/__init__.py +14 -0
- autogen/fast_depends/core/build.py +225 -0
- autogen/fast_depends/core/model.py +576 -0
- autogen/fast_depends/dependencies/__init__.py +15 -0
- autogen/fast_depends/dependencies/model.py +29 -0
- autogen/fast_depends/dependencies/provider.py +39 -0
- autogen/fast_depends/library/__init__.py +10 -0
- autogen/fast_depends/library/model.py +46 -0
- autogen/fast_depends/py.typed +6 -0
- autogen/fast_depends/schema.py +66 -0
- autogen/fast_depends/use.py +280 -0
- autogen/fast_depends/utils.py +187 -0
- autogen/formatting_utils.py +83 -0
- autogen/function_utils.py +13 -0
- autogen/graph_utils.py +178 -0
- autogen/import_utils.py +526 -0
- autogen/interop/__init__.py +22 -0
- autogen/interop/crewai/__init__.py +7 -0
- autogen/interop/crewai/crewai.py +88 -0
- autogen/interop/interoperability.py +71 -0
- autogen/interop/interoperable.py +46 -0
- autogen/interop/langchain/__init__.py +8 -0
- autogen/interop/langchain/langchain_chat_model_factory.py +155 -0
- autogen/interop/langchain/langchain_tool.py +82 -0
- autogen/interop/litellm/__init__.py +7 -0
- autogen/interop/litellm/litellm_config_factory.py +179 -0
- autogen/interop/pydantic_ai/__init__.py +7 -0
- autogen/interop/pydantic_ai/pydantic_ai.py +168 -0
- autogen/interop/registry.py +69 -0
- autogen/io/__init__.py +15 -0
- autogen/io/base.py +151 -0
- autogen/io/console.py +56 -0
- autogen/io/processors/__init__.py +12 -0
- autogen/io/processors/base.py +21 -0
- autogen/io/processors/console_event_processor.py +56 -0
- autogen/io/run_response.py +293 -0
- autogen/io/thread_io_stream.py +63 -0
- autogen/io/websockets.py +213 -0
- autogen/json_utils.py +43 -0
- autogen/llm_config.py +382 -0
- autogen/logger/__init__.py +11 -0
- autogen/logger/base_logger.py +128 -0
- autogen/logger/file_logger.py +261 -0
- autogen/logger/logger_factory.py +42 -0
- autogen/logger/logger_utils.py +57 -0
- autogen/logger/sqlite_logger.py +523 -0
- autogen/math_utils.py +339 -0
- autogen/mcp/__init__.py +7 -0
- autogen/mcp/__main__.py +78 -0
- autogen/mcp/mcp_client.py +208 -0
- autogen/mcp/mcp_proxy/__init__.py +19 -0
- autogen/mcp/mcp_proxy/fastapi_code_generator_helpers.py +63 -0
- autogen/mcp/mcp_proxy/mcp_proxy.py +581 -0
- autogen/mcp/mcp_proxy/operation_grouping.py +158 -0
- autogen/mcp/mcp_proxy/operation_renaming.py +114 -0
- autogen/mcp/mcp_proxy/patch_fastapi_code_generator.py +98 -0
- autogen/mcp/mcp_proxy/security.py +400 -0
- autogen/mcp/mcp_proxy/security_schema_visitor.py +37 -0
- autogen/messages/__init__.py +7 -0
- autogen/messages/agent_messages.py +948 -0
- autogen/messages/base_message.py +107 -0
- autogen/messages/client_messages.py +171 -0
- autogen/messages/print_message.py +49 -0
- autogen/oai/__init__.py +53 -0
- autogen/oai/anthropic.py +714 -0
- autogen/oai/bedrock.py +628 -0
- autogen/oai/cerebras.py +299 -0
- autogen/oai/client.py +1444 -0
- autogen/oai/client_utils.py +169 -0
- autogen/oai/cohere.py +479 -0
- autogen/oai/gemini.py +998 -0
- autogen/oai/gemini_types.py +155 -0
- autogen/oai/groq.py +305 -0
- autogen/oai/mistral.py +303 -0
- autogen/oai/oai_models/__init__.py +11 -0
- autogen/oai/oai_models/_models.py +16 -0
- autogen/oai/oai_models/chat_completion.py +87 -0
- autogen/oai/oai_models/chat_completion_audio.py +32 -0
- autogen/oai/oai_models/chat_completion_message.py +86 -0
- autogen/oai/oai_models/chat_completion_message_tool_call.py +37 -0
- autogen/oai/oai_models/chat_completion_token_logprob.py +63 -0
- autogen/oai/oai_models/completion_usage.py +60 -0
- autogen/oai/ollama.py +643 -0
- autogen/oai/openai_utils.py +881 -0
- autogen/oai/together.py +370 -0
- autogen/retrieve_utils.py +491 -0
- autogen/runtime_logging.py +160 -0
- autogen/token_count_utils.py +267 -0
- autogen/tools/__init__.py +20 -0
- autogen/tools/contrib/__init__.py +9 -0
- autogen/tools/contrib/time/__init__.py +7 -0
- autogen/tools/contrib/time/time.py +41 -0
- autogen/tools/dependency_injection.py +254 -0
- autogen/tools/experimental/__init__.py +48 -0
- autogen/tools/experimental/browser_use/__init__.py +7 -0
- autogen/tools/experimental/browser_use/browser_use.py +161 -0
- autogen/tools/experimental/crawl4ai/__init__.py +7 -0
- autogen/tools/experimental/crawl4ai/crawl4ai.py +153 -0
- autogen/tools/experimental/deep_research/__init__.py +7 -0
- autogen/tools/experimental/deep_research/deep_research.py +328 -0
- autogen/tools/experimental/duckduckgo/__init__.py +7 -0
- autogen/tools/experimental/duckduckgo/duckduckgo_search.py +109 -0
- autogen/tools/experimental/google/__init__.py +14 -0
- autogen/tools/experimental/google/authentication/__init__.py +11 -0
- autogen/tools/experimental/google/authentication/credentials_hosted_provider.py +43 -0
- autogen/tools/experimental/google/authentication/credentials_local_provider.py +91 -0
- autogen/tools/experimental/google/authentication/credentials_provider.py +35 -0
- autogen/tools/experimental/google/drive/__init__.py +9 -0
- autogen/tools/experimental/google/drive/drive_functions.py +124 -0
- autogen/tools/experimental/google/drive/toolkit.py +88 -0
- autogen/tools/experimental/google/model.py +17 -0
- autogen/tools/experimental/google/toolkit_protocol.py +19 -0
- autogen/tools/experimental/google_search/__init__.py +8 -0
- autogen/tools/experimental/google_search/google_search.py +93 -0
- autogen/tools/experimental/google_search/youtube_search.py +181 -0
- autogen/tools/experimental/messageplatform/__init__.py +17 -0
- autogen/tools/experimental/messageplatform/discord/__init__.py +7 -0
- autogen/tools/experimental/messageplatform/discord/discord.py +288 -0
- autogen/tools/experimental/messageplatform/slack/__init__.py +7 -0
- autogen/tools/experimental/messageplatform/slack/slack.py +391 -0
- autogen/tools/experimental/messageplatform/telegram/__init__.py +7 -0
- autogen/tools/experimental/messageplatform/telegram/telegram.py +275 -0
- autogen/tools/experimental/perplexity/__init__.py +7 -0
- autogen/tools/experimental/perplexity/perplexity_search.py +260 -0
- autogen/tools/experimental/reliable/__init__.py +10 -0
- autogen/tools/experimental/reliable/reliable.py +1316 -0
- autogen/tools/experimental/tavily/__init__.py +7 -0
- autogen/tools/experimental/tavily/tavily_search.py +183 -0
- autogen/tools/experimental/web_search_preview/__init__.py +7 -0
- autogen/tools/experimental/web_search_preview/web_search_preview.py +114 -0
- autogen/tools/experimental/wikipedia/__init__.py +7 -0
- autogen/tools/experimental/wikipedia/wikipedia.py +287 -0
- autogen/tools/function_utils.py +411 -0
- autogen/tools/tool.py +187 -0
- autogen/tools/toolkit.py +86 -0
- autogen/types.py +29 -0
- autogen/version.py +7 -0
- templates/client_template/main.jinja2 +69 -0
- templates/config_template/config.jinja2 +7 -0
- templates/main.jinja2 +61 -0
- ag2-0.9.1a1.dist-info/RECORD +0 -6
- ag2-0.9.1a1.dist-info/top_level.txt +0 -1
- {ag2-0.9.1a1.dist-info → ag2-0.9.2.dist-info/licenses}/LICENSE +0 -0
- {ag2-0.9.1a1.dist-info → ag2-0.9.2.dist-info/licenses}/NOTICE.md +0 -0
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import time
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Annotated, Optional, Union
|
|
11
|
+
|
|
12
|
+
from ....doc_utils import export_module
|
|
13
|
+
from ....import_utils import optional_import_block, require_optional_import
|
|
14
|
+
from .document_utils import handle_input
|
|
15
|
+
|
|
16
|
+
with optional_import_block():
|
|
17
|
+
from docling.datamodel.base_models import InputFormat
|
|
18
|
+
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions, PdfPipelineOptions
|
|
19
|
+
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
20
|
+
|
|
21
|
+
__all__ = ["docling_parse_docs"]
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
logger.setLevel(logging.DEBUG)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@require_optional_import(["docling"], "rag")
@export_module("autogen.agents.experimental.document_agent")
def docling_parse_docs(  # type: ignore[no-any-unimported]
    input_file_path: Annotated[Union[Path, str], "Path to the input file or directory"],
    output_dir_path: Annotated[Optional[Union[Path, str]], "Path to the output directory"] = None,
    output_formats: Annotated[Optional[list[str]], "List of output formats (markdown, json)"] = None,
    table_output_format: str = "html",
) -> list[Path]:
    """Convert documents with Docling and export them to an output directory.

    The inputs are collected with ``handle_input`` and converted with a
    Docling ``DocumentConverter``. PDF inputs use a pipeline with OCR
    (language ``"en"``) and table-structure recognition with cell matching
    enabled; GPU use by the OCR engine is switched off when the OCR options
    expose a ``use_gpu`` attribute, and the accelerator runs with 4 threads
    on an automatically selected device. Other formats use the converter's
    defaults.

    Supported formats (as listed by the original author):
        PDF,
        IMAGE,
        DOCX,
        HTML,
        PPTX,
        ASCIIDOC,
        MD,

    Args:
        input_file_path (Union[Path, str]): The path to the input file or directory.
        output_dir_path (Union[Path, str], optional): The path to the output directory.
            Defaults to ``<current working directory>/output``. Created if missing.
        output_formats (list[str], optional): The output formats, any of
            ``"markdown"`` and ``"json"``. Defaults to ["markdown"].
        table_output_format (str, optional): The output format for tables. Only
            ``"html"`` triggers a table export. Defaults to "html".

    Returns:
        list[Path]: Paths of the markdown/json files written, in conversion
        order. Exported HTML table files are written to the same directory but
        are not included in this list.

    Raises:
        ValueError: If ``handle_input`` finds no documents to convert.
    """
    # Resolve the destination once; every later step reuses this absolute path.
    output_dir = Path(output_dir_path or (Path.cwd() / "output")).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)
    # ToDo: For some reason, output_dir.mkdir is not creating the directory.
    # This is a workaround to create the directory if it does not exist.
    # Following test is failing without this workaround:
    # test/agents/experimental/document_agent/test_parser_utils.py::TestDoclingParseDocs::test_default_output_dir_path
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    output_formats = output_formats or ["markdown"]

    input_doc_paths: list[Path] = handle_input(input_file_path, output_dir=output_dir)

    if not input_doc_paths:
        raise ValueError("No documents found.")

    # Docling PDF pipeline: OCR + table structure recognition.
    # ----------------------
    pdf_pipeline_options = PdfPipelineOptions()
    pdf_pipeline_options.do_ocr = True
    # Not every OCR options class has a ``use_gpu`` switch, hence the guard.
    if hasattr(pdf_pipeline_options.ocr_options, "use_gpu"):
        pdf_pipeline_options.ocr_options.use_gpu = False
    pdf_pipeline_options.do_table_structure = True
    pdf_pipeline_options.table_structure_options.do_cell_matching = True
    pdf_pipeline_options.ocr_options.lang = ["en"]
    pdf_pipeline_options.accelerator_options = AcceleratorOptions(num_threads=4, device=AcceleratorDevice.AUTO)

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
        },
    )

    start_time = time.time()
    # convert_all is consumed eagerly so the timing below covers the whole batch.
    conv_results = list(doc_converter.convert_all(input_doc_paths))
    elapsed = time.time() - start_time

    logger.info(f"Document converted in {elapsed:.2f} seconds.")

    # Export results
    conv_files: list[Path] = []

    for res in conv_results:
        doc_filename = res.input.file.stem
        logger.debug(f"Document {res.input.file.name} converted.\nSaved markdown output to: {output_dir!s}")
        logger.debug(res.document._export_to_indented_text(max_text_len=16))

        if "markdown" in output_formats:
            # Export Docling document format to markdown.
            # Explicit UTF-8: document text routinely contains non-ASCII
            # characters and the platform default encoding may not cover them.
            output_file = output_dir / f"{doc_filename}.md"
            with output_file.open("w", encoding="utf-8") as fp:
                fp.write(res.document.export_to_markdown())
            conv_files.append(output_file)

        if "json" in output_formats:
            # Export Docling document format to json
            output_file = output_dir / f"{doc_filename}.json"
            with output_file.open("w", encoding="utf-8") as fp:
                fp.write(json.dumps(res.document.export_to_dict()))
            conv_files.append(output_file)

        # Export tables (used for evaluating conversion); these files are
        # intentionally not added to the returned list.
        if table_output_format == "html":
            for table_ix, table in enumerate(res.document.tables):
                # Save the table as html, numbered from 1.
                element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
                logger.debug(f"Saving HTML table to {element_html_filename}")
                with element_html_filename.open("w", encoding="utf-8") as fp:
                    fp.write(table.export_to_html())

    return conv_files
|
|
@@ -0,0 +1,426 @@
|
|
|
1
|
+
# Copyright (c) 2023 - 2025, AG2ai, Inc., AG2ai open-source projects maintainers and core contributors
|
|
2
|
+
#
|
|
3
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
4
|
+
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from typing import Any, Optional, Tuple
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
|
|
9
|
+
from ....import_utils import optional_import_block, require_optional_import
|
|
10
|
+
|
|
11
|
+
with optional_import_block():
|
|
12
|
+
import requests
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class InputFormat(Enum):
    """Closed set of input file formats used when classifying a URL or extension.

    Every member's value is the lowercase name of the format. ``INVALID`` is
    the catch-all member for inputs that cannot be mapped to a usable format.
    """

    # Office documents and presentations
    DOCX = "docx"
    PPTX = "pptx"

    # Markup
    HTML = "html"
    XML = "xml"

    # Images and paged documents
    IMAGE = "image"
    PDF = "pdf"

    # Lightweight text markup
    ASCIIDOC = "asciidoc"
    MD = "md"

    # Tabular and structured data
    CSV = "csv"
    XLSX = "xlsx"
    JSON = "json"

    # Server errors or not a URL
    INVALID = "invalid"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Map common file extensions to InputFormat
|
|
33
|
+
# INVALID means it's not supported
|
|
34
|
+
# See: https://github.com/DS4SD/docling/blob/e25d557c06afd77f1bb2c1ac4d2ece4dffcd52bd/docling/datamodel/base_models.py#L56
|
|
35
|
+
# Lookup table from a lowercase file extension (without the leading dot) to
# its InputFormat. Built group by group; the group order and the order of
# extensions inside each group fix the insertion order of the mapping.
ExtensionToFormat: dict[str, InputFormat] = {
    # DOCX formats
    **dict.fromkeys(("docx", "dotx", "docm", "dotm"), InputFormat.DOCX),
    # PPTX formats
    **dict.fromkeys(("pptx", "potx", "ppsx", "pptm", "potm", "ppsm"), InputFormat.PPTX),
    # Excel formats
    **dict.fromkeys(("xlsx",), InputFormat.XLSX),
    # HTML formats
    **dict.fromkeys(("html", "htm", "xhtml"), InputFormat.HTML),
    # XML formats
    # Note: .txt could be many formats, XML is just one possibility
    **dict.fromkeys(("xml", "nxml", "txt"), InputFormat.XML),
    # Image formats
    **dict.fromkeys(("png", "jpg", "jpeg", "tiff", "tif", "bmp"), InputFormat.IMAGE),
    # PDF format
    **dict.fromkeys(("pdf",), InputFormat.PDF),
    # AsciiDoc formats
    **dict.fromkeys(("adoc", "asciidoc", "asc"), InputFormat.ASCIIDOC),
    # Markdown formats
    **dict.fromkeys(("md", "markdown"), InputFormat.MD),
    # CSV format
    **dict.fromkeys(("csv",), InputFormat.CSV),
    # JSON format
    **dict.fromkeys(("json",), InputFormat.JSON),
    # Unsupported formats: known extensions that are explicitly rejected
    **dict.fromkeys(("doc", "ppt", "xls", "gif"), InputFormat.INVALID),
}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class URLAnalyzer:
|
|
87
|
+
"""
|
|
88
|
+
A class that analyzes URLs to determine if they point to web pages or files.
|
|
89
|
+
"""
|
|
90
|
+
|
|
91
|
+
# Mapping of input formats to their corresponding MIME types
|
|
92
|
+
FormatToMimeType: dict[InputFormat, list[str]] = {
|
|
93
|
+
InputFormat.DOCX: [
|
|
94
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
95
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
|
96
|
+
],
|
|
97
|
+
InputFormat.PPTX: [
|
|
98
|
+
"application/vnd.openxmlformats-officedocument.presentationml.template",
|
|
99
|
+
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
|
100
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
101
|
+
],
|
|
102
|
+
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
|
|
103
|
+
InputFormat.XML: ["application/xml", "text/xml", "text/plain"],
|
|
104
|
+
InputFormat.IMAGE: [
|
|
105
|
+
"image/png",
|
|
106
|
+
"image/jpeg",
|
|
107
|
+
"image/tiff",
|
|
108
|
+
"image/gif",
|
|
109
|
+
"image/bmp",
|
|
110
|
+
],
|
|
111
|
+
InputFormat.PDF: ["application/pdf"],
|
|
112
|
+
InputFormat.ASCIIDOC: ["text/asciidoc"],
|
|
113
|
+
InputFormat.MD: ["text/markdown", "text/x-markdown"],
|
|
114
|
+
InputFormat.CSV: ["text/csv"],
|
|
115
|
+
InputFormat.XLSX: ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"],
|
|
116
|
+
InputFormat.JSON: ["application/json"],
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
# Create a reverse mapping from MIME types to formats
|
|
120
|
+
# Note: For ambiguous MIME types (like "application/xml"), we'll favor the first format found
|
|
121
|
+
MimeTypeToFormat = {}
|
|
122
|
+
for format_type, mime_types in FormatToMimeType.items():
|
|
123
|
+
for mime_type in mime_types:
|
|
124
|
+
if mime_type not in MimeTypeToFormat:
|
|
125
|
+
MimeTypeToFormat[mime_type] = format_type
|
|
126
|
+
|
|
127
|
+
def __init__(self, url: str):
|
|
128
|
+
"""
|
|
129
|
+
Initialize the URLAnalyzer with a URL.
|
|
130
|
+
|
|
131
|
+
Args:
|
|
132
|
+
url (str): The URL to analyze
|
|
133
|
+
"""
|
|
134
|
+
self.url = url
|
|
135
|
+
self.analysis_result: Optional[dict[str, Any]] = None
|
|
136
|
+
self.final_url: Optional[str] = None
|
|
137
|
+
self.redirect_chain: list[str] = []
|
|
138
|
+
|
|
139
|
+
def analyze(
    self, test_url: bool = False, follow_redirects: bool = True, prioritize_extension: bool = True
) -> dict[str, Any]:
    """
    Analyze the URL to determine if it points to a web page or a file.

    The URL's own extension is always inspected first. When `test_url` is
    True an HTTP request is made as well and its findings are merged on top
    of the extension-based ones; `prioritize_extension` decides which side
    wins when both identify a file type.

    Args:
        test_url (bool): Whether to test the URL by making a request
        follow_redirects (bool): Whether to follow redirects when testing the URL
        prioritize_extension (bool): Whether to prioritize file extension over MIME type

    Returns:
        dict: The analysis result. It always contains "url", "is_file",
            "file_type", "mime_type", "method", "redirects", "redirect_count"
            and "final_url"; "extension", "redirect_chain" and any keys
            returned by the request analysis (e.g. "error") are added when
            available. The same dictionary is stored on `self.analysis_result`.
    """
    result = {
        "url": self.url,
        "is_file": False,
        "file_type": None,
        "mime_type": None,
        "method": "extension_analysis",
        "redirects": False,
        "redirect_count": 0,
        "final_url": self.url,
    }

    # First try to analyze based on the URL extension
    extension_analysis = self._analyze_by_extension(self.url)
    if extension_analysis["is_file"]:
        result.update(extension_analysis)

    # If test_url is True, make a request
    if test_url:
        request_analysis = self._analyze_by_request(follow_redirects)
        if request_analysis:
            # A failed request leaves final_url unset (None). Everything that
            # looks at the final URL therefore lives under this single guard,
            # so None is never handed to _analyze_by_extension.
            if self.final_url and self.final_url != self.url:
                result["redirects"] = True
                result["redirect_count"] = len(self.redirect_chain)
                result["redirect_chain"] = self.redirect_chain
                result["final_url"] = self.final_url

                # Re-analyze based on the final URL extension
                final_extension_analysis = self._analyze_by_extension(self.final_url)
                if final_extension_analysis["is_file"]:
                    if prioritize_extension:
                        # Keep the MIME type from the request but use file type from extension
                        mime_type = request_analysis.get("mime_type")
                        request_analysis.update(final_extension_analysis)
                        if mime_type:
                            request_analysis["mime_type"] = mime_type
                    elif not request_analysis.get("file_type"):
                        # Only use extension analysis if request didn't identify a file type
                        request_analysis.update(final_extension_analysis)

            # The original URL's extension already identified a file and the
            # request produced a MIME type: keep the extension-based file type
            # and let the request contribute the MIME type only.
            if (
                prioritize_extension
                and result.get("extension")
                and result.get("file_type")
                and request_analysis.get("mime_type")
            ):
                request_analysis["file_type"] = result["file_type"]
                request_analysis["is_file"] = True

            result.update(request_analysis)
            result["method"] = "request_analysis"

    # Store the result for later access
    self.analysis_result = result

    return result
|
|
214
|
+
|
|
215
|
+
def _analyze_by_extension(self, url: str) -> dict[str, Any]:
|
|
216
|
+
"""
|
|
217
|
+
Analyze URL based on its file extension.
|
|
218
|
+
|
|
219
|
+
Args:
|
|
220
|
+
url (str): The URL to analyze
|
|
221
|
+
|
|
222
|
+
Returns:
|
|
223
|
+
dict: Analysis results based on the file extension
|
|
224
|
+
"""
|
|
225
|
+
parsed_url = urlparse(url)
|
|
226
|
+
path = parsed_url.path.lower()
|
|
227
|
+
|
|
228
|
+
# Check if the URL has a file extension
|
|
229
|
+
if "." in path:
|
|
230
|
+
extension = path.split(".")[-1]
|
|
231
|
+
|
|
232
|
+
# Check if it's a known file extension
|
|
233
|
+
if extension in ExtensionToFormat:
|
|
234
|
+
format_type = ExtensionToFormat[extension]
|
|
235
|
+
return {
|
|
236
|
+
"is_file": True,
|
|
237
|
+
"file_type": format_type,
|
|
238
|
+
"extension": extension,
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
# If no file extension was found or it's not recognized,
|
|
242
|
+
# assume it's a web page (but this could be confirmed with a request)
|
|
243
|
+
return {
|
|
244
|
+
"is_file": False,
|
|
245
|
+
"file_type": None,
|
|
246
|
+
"extension": None,
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
@require_optional_import(["requests"], "rag")
|
|
250
|
+
def _analyze_by_request(self, follow_redirects: bool = True) -> Optional[dict[str, Any]]:
    """
    Analyze URL by making a HEAD request to check Content-Type.

    A HEAD request is tried first; if the server answers 405 the request is
    repeated as a streamed GET whose body is never downloaded. On a response
    with status below 400, `self.redirect_chain` and `self.final_url` are
    updated from the response history.

    Args:
        follow_redirects (bool): Whether to follow redirects

    Returns:
        Optional[dict]: Analysis results based on the HTTP response. Every
            result carries "is_file", "file_type" and "mime_type". Failures
            (HTTP status >= 400, too many redirects, connection error,
            timeout, invalid URL, or any other exception) are reported with
            file_type InputFormat.INVALID plus an "error" message instead of
            being raised.
    """

    def invalid(error: str, **extra: Any) -> dict[str, Any]:
        # Common shape of every failure result; `extra` appends
        # failure-specific keys after the shared ones.
        return {
            "is_file": False,
            "file_type": InputFormat.INVALID,
            "mime_type": None,
            "error": error,
            **extra,
        }

    try:
        # Store redirect history
        self.redirect_chain = []

        # First try a HEAD request (faster but some servers don't handle it well)
        response = requests.head(self.url, allow_redirects=follow_redirects, timeout=5)

        # If the server returns a 405 (Method Not Allowed) for HEAD, try GET
        if response.status_code == 405:
            response = requests.get(self.url, allow_redirects=follow_redirects, timeout=5, stream=True)
            # Close the connection without downloading the content
            response.close()

        # Check for non-success status codes
        if response.status_code >= 400:
            return invalid(f"HTTP error: {response.status_code}", status_code=response.status_code)

        # Store information about redirects
        if hasattr(response, "history") and response.history:
            self.redirect_chain = [r.url for r in response.history]
            self.final_url = response.url
        else:
            self.final_url = self.url

        # Keep only the media type (drop parameters such as charset) and
        # lower-case it: media types are case-insensitive, while the lookup
        # table below is keyed by lower-case names.
        content_type = response.headers.get("Content-Type", "").split(";")[0].strip().lower()

        # Check if it matches any of our known MIME types
        format_type = self.MimeTypeToFormat.get(content_type)

        if format_type:
            return {
                "is_file": True,
                "file_type": format_type,
                "mime_type": content_type,
            }
        # NOTE(review): FormatToMimeType lists both HTML MIME types, so they
        # normally match the branch above and are reported as files of the
        # HTML format; this branch appears to be a fallback only - confirm
        # which behavior callers expect.
        if content_type in ["text/html", "application/xhtml+xml"]:
            return {
                "is_file": False,
                "file_type": None,
                "mime_type": content_type,
            }
        # Content type is not in our mapping (this includes a missing
        # Content-Type header, which arrives here as an empty string)
        return {
            "is_file": True,
            "file_type": "unknown",
            "mime_type": content_type,
        }

    except requests.exceptions.TooManyRedirects:
        # Handle redirect loops or too many redirects
        return invalid("Too many redirects", redirects=True)
    except requests.exceptions.ConnectionError:
        # Handle connection errors (e.g., DNS failure, refused connection)
        return invalid("Connection error - URL may be invalid or server unavailable")
    except requests.exceptions.Timeout:
        # Handle timeout
        return invalid("Request timed out")
    except requests.exceptions.InvalidURL:
        # Handle invalid URL
        return invalid("Invalid URL format")
    except Exception as e:
        # Deliberate best-effort: any other failure is reported, not raised
        return invalid(str(e))
|
|
348
|
+
|
|
349
|
+
def get_result(self) -> Optional[dict[str, Any]]:
    """
    Return the analysis result currently stored on this instance.

    Returns:
        Optional[dict]: The value of `self.analysis_result` - the stored
            analysis dictionary, or None when nothing has been stored yet
    """
    latest = self.analysis_result
    return latest
|
|
357
|
+
|
|
358
|
+
def get_redirect_info(self) -> dict[str, Any]:
    """
    Summarize the redirect state recorded on this instance.

    Returns:
        dict: "redirects", "redirect_count", "original_url", "final_url"
            and "redirect_chain". While no final URL has been recorded the
            summary reports no redirects and uses the original URL as the
            final one.
    """
    final = self.final_url
    if final:
        # A final URL is known: report it together with the recorded chain.
        return {
            "redirects": self.url != final,
            "redirect_count": len(self.redirect_chain),
            "original_url": self.url,
            "final_url": final,
            "redirect_chain": self.redirect_chain,
        }

    # Nothing recorded yet: fall back to a "no redirects" summary.
    return {
        "redirects": False,
        "redirect_count": 0,
        "original_url": self.url,
        "final_url": self.url,
        "redirect_chain": [],
    }
|
|
381
|
+
|
|
382
|
+
@require_optional_import(["requests"], "rag")
|
|
383
|
+
def follow_redirects(self) -> Tuple[str, list[str]]:
    """
    Resolve the URL through its redirects without looking at content types.

    A HEAD request is issued with redirects enabled; a 405 answer triggers a
    streamed GET instead, closed before any body is read. The outcome is
    recorded on `self.final_url` and `self.redirect_chain`.

    Returns:
        Tuple[str, list[str]]: The final URL and the redirect chain. If the
            request raises, the original URL and an empty list are returned
            and the recorded state is left untouched.
    """
    try:
        resp = requests.head(self.url, allow_redirects=True, timeout=5)

        # Some servers reject HEAD with 405 (Method Not Allowed); retry as GET.
        if resp.status_code == 405:
            resp = requests.get(self.url, allow_redirects=True, timeout=5, stream=True)
            # Release the connection without downloading the content.
            resp.close()

        hops = getattr(resp, "history", None)
        if hops:
            # Each history entry is one intermediate response of the chain.
            self.redirect_chain = [hop.url for hop in hops]
            self.final_url = resp.url
        else:
            self.final_url = self.url
            self.redirect_chain = []

        return self.final_url, self.redirect_chain

    except Exception:
        # Best effort: if the request fails, return the original URL
        return self.url, []
|
|
412
|
+
|
|
413
|
+
@classmethod
|
|
414
|
+
def get_supported_formats(cls) -> list[InputFormat]:
    """Return the formats that have an entry in FormatToMimeType, in declaration order."""
    # Iterating a dict yields its keys, so no explicit .keys() call is needed.
    return list(cls.FormatToMimeType)
|
|
417
|
+
|
|
418
|
+
@classmethod
|
|
419
|
+
def get_supported_mime_types(cls) -> list[str]:
    """Return every MIME type that is a key of MimeTypeToFormat."""
    # Iterating a dict yields its keys, so no explicit .keys() call is needed.
    return list(cls.MimeTypeToFormat)
|
|
422
|
+
|
|
423
|
+
@classmethod
|
|
424
|
+
def get_supported_extensions(cls) -> list[str]:
    """Return every file extension that is a key of ExtensionToFormat."""
    # Iterating a dict yields its keys, so no explicit .keys() call is needed.
    return list(ExtensionToFormat)
|