deepset-mcp 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. deepset_mcp/__init__.py +0 -0
  2. deepset_mcp/agents/__init__.py +0 -0
  3. deepset_mcp/agents/debugging/__init__.py +0 -0
  4. deepset_mcp/agents/debugging/debugging_agent.py +37 -0
  5. deepset_mcp/agents/debugging/system_prompt.md +214 -0
  6. deepset_mcp/agents/generalist/__init__.py +0 -0
  7. deepset_mcp/agents/generalist/generalist_agent.py +38 -0
  8. deepset_mcp/agents/generalist/system_prompt.md +241 -0
  9. deepset_mcp/api/README.md +536 -0
  10. deepset_mcp/api/__init__.py +0 -0
  11. deepset_mcp/api/client.py +277 -0
  12. deepset_mcp/api/custom_components/__init__.py +0 -0
  13. deepset_mcp/api/custom_components/models.py +25 -0
  14. deepset_mcp/api/custom_components/protocols.py +17 -0
  15. deepset_mcp/api/custom_components/resource.py +56 -0
  16. deepset_mcp/api/exceptions.py +70 -0
  17. deepset_mcp/api/haystack_service/__init__.py +0 -0
  18. deepset_mcp/api/haystack_service/protocols.py +13 -0
  19. deepset_mcp/api/haystack_service/resource.py +55 -0
  20. deepset_mcp/api/indexes/__init__.py +0 -0
  21. deepset_mcp/api/indexes/models.py +63 -0
  22. deepset_mcp/api/indexes/protocols.py +53 -0
  23. deepset_mcp/api/indexes/resource.py +138 -0
  24. deepset_mcp/api/integrations/__init__.py +1 -0
  25. deepset_mcp/api/integrations/models.py +49 -0
  26. deepset_mcp/api/integrations/protocols.py +27 -0
  27. deepset_mcp/api/integrations/resource.py +57 -0
  28. deepset_mcp/api/pipeline/__init__.py +17 -0
  29. deepset_mcp/api/pipeline/log_level.py +9 -0
  30. deepset_mcp/api/pipeline/models.py +235 -0
  31. deepset_mcp/api/pipeline/protocols.py +83 -0
  32. deepset_mcp/api/pipeline/resource.py +378 -0
  33. deepset_mcp/api/pipeline_template/__init__.py +0 -0
  34. deepset_mcp/api/pipeline_template/models.py +56 -0
  35. deepset_mcp/api/pipeline_template/protocols.py +17 -0
  36. deepset_mcp/api/pipeline_template/resource.py +88 -0
  37. deepset_mcp/api/protocols.py +122 -0
  38. deepset_mcp/api/secrets/__init__.py +0 -0
  39. deepset_mcp/api/secrets/models.py +16 -0
  40. deepset_mcp/api/secrets/protocols.py +29 -0
  41. deepset_mcp/api/secrets/resource.py +112 -0
  42. deepset_mcp/api/shared_models.py +17 -0
  43. deepset_mcp/api/transport.py +336 -0
  44. deepset_mcp/api/user/__init__.py +0 -0
  45. deepset_mcp/api/user/protocols.py +11 -0
  46. deepset_mcp/api/user/resource.py +38 -0
  47. deepset_mcp/api/workspace/__init__.py +7 -0
  48. deepset_mcp/api/workspace/models.py +23 -0
  49. deepset_mcp/api/workspace/protocols.py +41 -0
  50. deepset_mcp/api/workspace/resource.py +94 -0
  51. deepset_mcp/benchmark/README.md +425 -0
  52. deepset_mcp/benchmark/__init__.py +1 -0
  53. deepset_mcp/benchmark/agent_configs/debugging_agent.yml +10 -0
  54. deepset_mcp/benchmark/agent_configs/generalist_agent.yml +6 -0
  55. deepset_mcp/benchmark/dp_validation_error_analysis/__init__.py +0 -0
  56. deepset_mcp/benchmark/dp_validation_error_analysis/eda.ipynb +757 -0
  57. deepset_mcp/benchmark/dp_validation_error_analysis/prepare_interaction_data.ipynb +167 -0
  58. deepset_mcp/benchmark/dp_validation_error_analysis/preprocessing_utils.py +213 -0
  59. deepset_mcp/benchmark/runner/__init__.py +0 -0
  60. deepset_mcp/benchmark/runner/agent_benchmark_runner.py +561 -0
  61. deepset_mcp/benchmark/runner/agent_loader.py +110 -0
  62. deepset_mcp/benchmark/runner/cli.py +39 -0
  63. deepset_mcp/benchmark/runner/cli_agent.py +373 -0
  64. deepset_mcp/benchmark/runner/cli_index.py +71 -0
  65. deepset_mcp/benchmark/runner/cli_pipeline.py +73 -0
  66. deepset_mcp/benchmark/runner/cli_tests.py +226 -0
  67. deepset_mcp/benchmark/runner/cli_utils.py +61 -0
  68. deepset_mcp/benchmark/runner/config.py +73 -0
  69. deepset_mcp/benchmark/runner/config_loader.py +64 -0
  70. deepset_mcp/benchmark/runner/interactive.py +140 -0
  71. deepset_mcp/benchmark/runner/models.py +203 -0
  72. deepset_mcp/benchmark/runner/repl.py +67 -0
  73. deepset_mcp/benchmark/runner/setup_actions.py +238 -0
  74. deepset_mcp/benchmark/runner/streaming.py +360 -0
  75. deepset_mcp/benchmark/runner/teardown_actions.py +196 -0
  76. deepset_mcp/benchmark/runner/tracing.py +21 -0
  77. deepset_mcp/benchmark/tasks/chat_rag_answers_wrong_format.yml +16 -0
  78. deepset_mcp/benchmark/tasks/documents_output_wrong.yml +13 -0
  79. deepset_mcp/benchmark/tasks/jinja_str_instead_of_complex_type.yml +11 -0
  80. deepset_mcp/benchmark/tasks/jinja_syntax_error.yml +11 -0
  81. deepset_mcp/benchmark/tasks/missing_output_mapping.yml +14 -0
  82. deepset_mcp/benchmark/tasks/no_query_input.yml +13 -0
  83. deepset_mcp/benchmark/tasks/pipelines/chat_agent_jinja_str.yml +141 -0
  84. deepset_mcp/benchmark/tasks/pipelines/chat_agent_jinja_syntax.yml +141 -0
  85. deepset_mcp/benchmark/tasks/pipelines/chat_rag_answers_wrong_format.yml +181 -0
  86. deepset_mcp/benchmark/tasks/pipelines/chat_rag_missing_output_mapping.yml +189 -0
  87. deepset_mcp/benchmark/tasks/pipelines/rag_documents_wrong_format.yml +193 -0
  88. deepset_mcp/benchmark/tasks/pipelines/rag_no_query_input.yml +191 -0
  89. deepset_mcp/benchmark/tasks/pipelines/standard_index.yml +167 -0
  90. deepset_mcp/initialize_embedding_model.py +12 -0
  91. deepset_mcp/main.py +133 -0
  92. deepset_mcp/prompts/deepset_copilot_prompt.md +271 -0
  93. deepset_mcp/prompts/deepset_debugging_agent.md +214 -0
  94. deepset_mcp/store.py +5 -0
  95. deepset_mcp/tool_factory.py +473 -0
  96. deepset_mcp/tools/__init__.py +0 -0
  97. deepset_mcp/tools/custom_components.py +52 -0
  98. deepset_mcp/tools/doc_search.py +83 -0
  99. deepset_mcp/tools/haystack_service.py +358 -0
  100. deepset_mcp/tools/haystack_service_models.py +97 -0
  101. deepset_mcp/tools/indexes.py +129 -0
  102. deepset_mcp/tools/model_protocol.py +16 -0
  103. deepset_mcp/tools/pipeline.py +335 -0
  104. deepset_mcp/tools/pipeline_template.py +116 -0
  105. deepset_mcp/tools/secrets.py +45 -0
  106. deepset_mcp/tools/tokonomics/__init__.py +73 -0
  107. deepset_mcp/tools/tokonomics/decorators.py +396 -0
  108. deepset_mcp/tools/tokonomics/explorer.py +347 -0
  109. deepset_mcp/tools/tokonomics/object_store.py +177 -0
  110. deepset_mcp/tools/workspace.py +61 -0
  111. deepset_mcp-0.0.2.dist-info/METADATA +288 -0
  112. deepset_mcp-0.0.2.dist-info/RECORD +114 -0
  113. deepset_mcp-0.0.2.dist-info/WHEEL +4 -0
  114. deepset_mcp-0.0.2.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,193 @@
1
+ components:
2
+ chat_summary_prompt_builder:
3
+ type: haystack.components.builders.prompt_builder.PromptBuilder
4
+ init_parameters:
5
+ template: |-
6
+ You are part of a chatbot.
7
+ You receive a question (Current Question) and a chat history.
8
+ Use the context from the chat history and reformulate the question so that it is suitable for retrieval augmented generation.
9
+ If X is followed by Y, only ask for Y and do not repeat X again.
10
+ If the question does not require any context from the chat history, output it unedited.
11
+ Don't make questions too long, but short and precise.
12
+ Stay as close as possible to the current question.
13
+ Only output the new question, nothing else!
14
+
15
+ {{ question }}
16
+
17
+ New question:
18
+
19
+ chat_summary_llm:
20
+ type: haystack.components.generators.openai.OpenAIGenerator
21
+ init_parameters:
22
+ api_key: {"type": "env_var", "env_vars": ["OPENAI_API_KEY"], "strict": false}
23
+ model: "gpt-4o"
24
+ generation_kwargs:
25
+ max_tokens: 650
26
+ temperature: 0
27
+ seed: 0
28
+
29
+ replies_to_query:
30
+ type: haystack.components.converters.output_adapter.OutputAdapter
31
+ init_parameters:
32
+ template: "{{ replies[0] }}"
33
+ output_type: str
34
+
35
+ bm25_retriever: # Selects the most similar documents from the document store
36
+ type: haystack_integrations.components.retrievers.opensearch.bm25_retriever.OpenSearchBM25Retriever
37
+ init_parameters:
38
+ document_store:
39
+ type: haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore
40
+ init_parameters:
41
+ hosts:
42
+ index: 'standard-index'
43
+ max_chunk_bytes: 104857600
44
+ embedding_dim: 768
45
+ return_embedding: false
46
+ method:
47
+ mappings:
48
+ settings:
49
+ create_index: true
50
+ http_auth:
51
+ use_ssl:
52
+ verify_certs:
53
+ timeout:
54
+ top_k: 20 # The number of results to return
55
+
56
+ query_embedder:
57
+ type: deepset_cloud_custom_nodes.embedders.nvidia.text_embedder.DeepsetNvidiaTextEmbedder
58
+ init_parameters:
59
+ normalize_embeddings: true
60
+ model: intfloat/e5-base-v2
61
+
62
+
63
+ embedding_retriever: # Selects the most similar documents from the document store
64
+ type: haystack_integrations.components.retrievers.opensearch.embedding_retriever.OpenSearchEmbeddingRetriever
65
+ init_parameters:
66
+ document_store:
67
+ type: haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore
68
+ init_parameters:
69
+ hosts:
70
+ index: 'standard-index'
71
+ max_chunk_bytes: 104857600
72
+ embedding_dim: 768
73
+ return_embedding: false
74
+ method:
75
+ mappings:
76
+ settings:
77
+ create_index: true
78
+ http_auth:
79
+ use_ssl:
80
+ verify_certs:
81
+ timeout:
82
+ top_k: 20 # The number of results to return
83
+
84
+ document_joiner:
85
+ type: haystack.components.joiners.document_joiner.DocumentJoiner
86
+ init_parameters:
87
+ join_mode: concatenate
88
+
89
+ ranker:
90
+ type: deepset_cloud_custom_nodes.rankers.nvidia.ranker.DeepsetNvidiaRanker
91
+ init_parameters:
92
+ model: intfloat/simlm-msmarco-reranker
93
+ top_k: 8
94
+
95
+
96
+ qa_prompt_builder:
97
+ type: haystack.components.builders.prompt_builder.PromptBuilder
98
+ init_parameters:
99
+ template: |-
100
+ You are a technical expert.
101
+ You answer questions truthfully based on provided documents.
102
+ Ignore typing errors in the question.
103
+ For each document check whether it is related to the question.
104
+ Only use documents that are related to the question to answer it.
105
+ Ignore documents that are not related to the question.
106
+ If the answer exists in several documents, summarize them.
107
+ Only answer based on the documents provided. Don't make things up.
108
+ Just output the structured, informative and precise answer and nothing else.
109
+ If the documents can't answer the question, say so.
110
+ Always use references in the form [NUMBER OF DOCUMENT] when using information from a document, e.g. [3] for Document [3] .
111
+ Never name the documents, only enter a number in square brackets as a reference.
112
+ The reference must only refer to the number that comes in square brackets after the document.
113
+ Otherwise, do not use brackets in your answer and reference ONLY the number of the document without mentioning the word document.
114
+
115
+ These are the documents:
116
+ {%- if documents|length > 0 %}
117
+ {%- for document in documents %}
118
+ Document [{{ loop.index }}] :
119
+ Name of Source File: {{ document.meta.file_name }}
120
+ {{ document.content }}
121
+ {% endfor -%}
122
+ {%- else %}
123
+ No relevant documents found.
124
+ Respond with "Sorry, no matching documents were found, please adjust the filters or try a different question."
125
+ {% endif %}
126
+
127
+ Question: {{ question }}
128
+ Answer:
129
+
130
+ qa_llm:
131
+ type: haystack.components.generators.openai.OpenAIGenerator
132
+ init_parameters:
133
+ api_key: {"type": "env_var", "env_vars": ["OPENAI_API_KEY"], "strict": false}
134
+ model: "gpt-4o"
135
+ generation_kwargs:
136
+ max_tokens: 650
137
+ temperature: 0
138
+ seed: 0
139
+
140
+ answer_builder:
141
+ type: deepset_cloud_custom_nodes.augmenters.deepset_answer_builder.DeepsetAnswerBuilder
142
+ init_parameters:
143
+ reference_pattern: acm
144
+
145
+ connections: # Defines how the components are connected
146
+ - sender: chat_summary_prompt_builder.prompt
147
+ receiver: chat_summary_llm.prompt
148
+ - sender: chat_summary_llm.replies
149
+ receiver: replies_to_query.replies
150
+ - sender: replies_to_query.output
151
+ receiver: bm25_retriever.query
152
+ - sender: replies_to_query.output
153
+ receiver: query_embedder.text
154
+ - sender: replies_to_query.output
155
+ receiver: ranker.query
156
+ - sender: replies_to_query.output
157
+ receiver: qa_prompt_builder.question
158
+ - sender: replies_to_query.output
159
+ receiver: answer_builder.query
160
+ - sender: bm25_retriever.documents
161
+ receiver: document_joiner.documents
162
+ - sender: query_embedder.embedding
163
+ receiver: embedding_retriever.query_embedding
164
+ - sender: embedding_retriever.documents
165
+ receiver: document_joiner.documents
166
+ - sender: document_joiner.documents
167
+ receiver: ranker.documents
168
+ - sender: ranker.documents
169
+ receiver: qa_prompt_builder.documents
170
+ - sender: ranker.documents
171
+ receiver: answer_builder.documents
172
+ - sender: qa_prompt_builder.prompt
173
+ receiver: qa_llm.prompt
174
+ - sender: qa_prompt_builder.prompt
175
+ receiver: answer_builder.prompt
176
+ - sender: qa_llm.replies
177
+ receiver: answer_builder.replies
178
+
179
+ inputs: # Define the inputs for your pipeline
180
+ query: # These components will receive the query as input
181
+ - "chat_summary_prompt_builder.question"
182
+
183
+ filters: # These components will receive a potential query filter as input
184
+ - "bm25_retriever.filters"
185
+ - "embedding_retriever.filters"
186
+
187
+ outputs: # Defines the output of your pipeline
188
+ documents: "qa_prompt_builder.prompt" # The output of the pipeline is the retrieved documents
189
+ answers: "answer_builder.answers" # The output of the pipeline is the generated answers
190
+
191
+ max_runs_per_component: 100
192
+
193
+ metadata: {}
@@ -0,0 +1,191 @@
1
+ components:
2
+ chat_summary_prompt_builder:
3
+ type: haystack.components.builders.prompt_builder.PromptBuilder
4
+ init_parameters:
5
+ template: |-
6
+ You are part of a chatbot.
7
+ You receive a question (Current Question) and a chat history.
8
+ Use the context from the chat history and reformulate the question so that it is suitable for retrieval augmented generation.
9
+ If X is followed by Y, only ask for Y and do not repeat X again.
10
+ If the question does not require any context from the chat history, output it unedited.
11
+ Don't make questions too long, but short and precise.
12
+ Stay as close as possible to the current question.
13
+ Only output the new question, nothing else!
14
+
15
+ {{ question }}
16
+
17
+ New question:
18
+
19
+ chat_summary_llm:
20
+ type: haystack.components.generators.openai.OpenAIGenerator
21
+ init_parameters:
22
+ api_key: {"type": "env_var", "env_vars": ["OPENAI_API_KEY"], "strict": false}
23
+ model: "gpt-4o"
24
+ generation_kwargs:
25
+ max_tokens: 650
26
+ temperature: 0
27
+ seed: 0
28
+
29
+ replies_to_query:
30
+ type: haystack.components.converters.output_adapter.OutputAdapter
31
+ init_parameters:
32
+ template: "{{ replies[0] }}"
33
+ output_type: str
34
+
35
+ bm25_retriever: # Selects the most similar documents from the document store
36
+ type: haystack_integrations.components.retrievers.opensearch.bm25_retriever.OpenSearchBM25Retriever
37
+ init_parameters:
38
+ document_store:
39
+ type: haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore
40
+ init_parameters:
41
+ hosts:
42
+ index: 'standard-index'
43
+ max_chunk_bytes: 104857600
44
+ embedding_dim: 768
45
+ return_embedding: false
46
+ method:
47
+ mappings:
48
+ settings:
49
+ create_index: true
50
+ http_auth:
51
+ use_ssl:
52
+ verify_certs:
53
+ timeout:
54
+ top_k: 20 # The number of results to return
55
+
56
+ query_embedder:
57
+ type: deepset_cloud_custom_nodes.embedders.nvidia.text_embedder.DeepsetNvidiaTextEmbedder
58
+ init_parameters:
59
+ normalize_embeddings: true
60
+ model: intfloat/e5-base-v2
61
+
62
+
63
+ embedding_retriever: # Selects the most similar documents from the document store
64
+ type: haystack_integrations.components.retrievers.opensearch.embedding_retriever.OpenSearchEmbeddingRetriever
65
+ init_parameters:
66
+ document_store:
67
+ type: haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore
68
+ init_parameters:
69
+ hosts:
70
+ index: 'standard-index'
71
+ max_chunk_bytes: 104857600
72
+ embedding_dim: 768
73
+ return_embedding: false
74
+ method:
75
+ mappings:
76
+ settings:
77
+ create_index: true
78
+ http_auth:
79
+ use_ssl:
80
+ verify_certs:
81
+ timeout:
82
+ top_k: 20 # The number of results to return
83
+
84
+ document_joiner:
85
+ type: haystack.components.joiners.document_joiner.DocumentJoiner
86
+ init_parameters:
87
+ join_mode: concatenate
88
+
89
+ ranker:
90
+ type: deepset_cloud_custom_nodes.rankers.nvidia.ranker.DeepsetNvidiaRanker
91
+ init_parameters:
92
+ model: intfloat/simlm-msmarco-reranker
93
+ top_k: 8
94
+
95
+
96
+ qa_prompt_builder:
97
+ type: haystack.components.builders.prompt_builder.PromptBuilder
98
+ init_parameters:
99
+ template: |-
100
+ You are a technical expert.
101
+ You answer questions truthfully based on provided documents.
102
+ Ignore typing errors in the question.
103
+ For each document check whether it is related to the question.
104
+ Only use documents that are related to the question to answer it.
105
+ Ignore documents that are not related to the question.
106
+ If the answer exists in several documents, summarize them.
107
+ Only answer based on the documents provided. Don't make things up.
108
+ Just output the structured, informative and precise answer and nothing else.
109
+ If the documents can't answer the question, say so.
110
+ Always use references in the form [NUMBER OF DOCUMENT] when using information from a document, e.g. [3] for Document [3] .
111
+ Never name the documents, only enter a number in square brackets as a reference.
112
+ The reference must only refer to the number that comes in square brackets after the document.
113
+ Otherwise, do not use brackets in your answer and reference ONLY the number of the document without mentioning the word document.
114
+
115
+ These are the documents:
116
+ {%- if documents|length > 0 %}
117
+ {%- for document in documents %}
118
+ Document [{{ loop.index }}] :
119
+ Name of Source File: {{ document.meta.file_name }}
120
+ {{ document.content }}
121
+ {% endfor -%}
122
+ {%- else %}
123
+ No relevant documents found.
124
+ Respond with "Sorry, no matching documents were found, please adjust the filters or try a different question."
125
+ {% endif %}
126
+
127
+ Question: {{ question }}
128
+ Answer:
129
+
130
+ qa_llm:
131
+ type: haystack.components.generators.openai.OpenAIGenerator
132
+ init_parameters:
133
+ api_key: {"type": "env_var", "env_vars": ["OPENAI_API_KEY"], "strict": false}
134
+ model: "gpt-4o"
135
+ generation_kwargs:
136
+ max_tokens: 650
137
+ temperature: 0
138
+ seed: 0
139
+
140
+ answer_builder:
141
+ type: deepset_cloud_custom_nodes.augmenters.deepset_answer_builder.DeepsetAnswerBuilder
142
+ init_parameters:
143
+ reference_pattern: acm
144
+
145
+ connections: # Defines how the components are connected
146
+ - sender: chat_summary_prompt_builder.prompt
147
+ receiver: chat_summary_llm.prompt
148
+ - sender: chat_summary_llm.replies
149
+ receiver: replies_to_query.replies
150
+ - sender: replies_to_query.output
151
+ receiver: bm25_retriever.query
152
+ - sender: replies_to_query.output
153
+ receiver: query_embedder.text
154
+ - sender: replies_to_query.output
155
+ receiver: ranker.query
156
+ - sender: replies_to_query.output
157
+ receiver: qa_prompt_builder.question
158
+ - sender: replies_to_query.output
159
+ receiver: answer_builder.query
160
+ - sender: bm25_retriever.documents
161
+ receiver: document_joiner.documents
162
+ - sender: query_embedder.embedding
163
+ receiver: embedding_retriever.query_embedding
164
+ - sender: embedding_retriever.documents
165
+ receiver: document_joiner.documents
166
+ - sender: document_joiner.documents
167
+ receiver: ranker.documents
168
+ - sender: ranker.documents
169
+ receiver: qa_prompt_builder.documents
170
+ - sender: ranker.documents
171
+ receiver: answer_builder.documents
172
+ - sender: qa_prompt_builder.prompt
173
+ receiver: qa_llm.prompt
174
+ - sender: qa_prompt_builder.prompt
175
+ receiver: answer_builder.prompt
176
+ - sender: qa_llm.replies
177
+ receiver: answer_builder.replies
178
+
179
+ inputs: # Define the inputs for your pipeline
180
+
181
+ filters: # These components will receive a potential query filter as input
182
+ - "bm25_retriever.filters"
183
+ - "embedding_retriever.filters"
184
+
185
+ outputs: # Defines the output of your pipeline
186
+ documents: "ranker.documents" # The output of the pipeline is the retrieved documents
187
+ answers: "answer_builder.answers" # The output of the pipeline is the generated answers
188
+
189
+ max_runs_per_component: 100
190
+
191
+ metadata: {}
@@ -0,0 +1,167 @@
1
+ # If you need help with the YAML format, have a look at https://docs.cloud.deepset.ai/v2.0/docs/create-a-pipeline#create-a-pipeline-using-pipeline-editor.
2
+ # This section defines components that you want to use in your pipelines. Each component must have a name and a type. You can also set the component's parameters here.
3
+ # The name is up to you, you can give your component a friendly name. You then use components' names when specifying the connections in the pipeline.
4
+ # Type is the class path of the component. You can check the type on the component's documentation page.
5
+ components:
6
+ file_classifier:
7
+ type: haystack.components.routers.file_type_router.FileTypeRouter
8
+ init_parameters:
9
+ mime_types:
10
+ - text/plain
11
+ - application/pdf
12
+ - text/markdown
13
+ - text/html
14
+ - application/vnd.openxmlformats-officedocument.wordprocessingml.document
15
+ - application/vnd.openxmlformats-officedocument.presentationml.presentation
16
+ - application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
17
+ - text/csv
18
+
19
+ text_converter:
20
+ type: haystack.components.converters.txt.TextFileToDocument
21
+ init_parameters:
22
+ encoding: utf-8
23
+
24
+ pdf_converter:
25
+ type: haystack.components.converters.pdfminer.PDFMinerToDocument
26
+ init_parameters:
27
+ line_overlap: 0.5
28
+ char_margin: 2
29
+ line_margin: 0.5
30
+ word_margin: 0.1
31
+ boxes_flow: 0.5
32
+ detect_vertical: true
33
+ all_texts: false
34
+ store_full_path: false
35
+
36
+ markdown_converter:
37
+ type: haystack.components.converters.txt.TextFileToDocument
38
+ init_parameters:
39
+ encoding: utf-8
40
+
41
+ html_converter:
42
+ type: haystack.components.converters.html.HTMLToDocument
43
+ init_parameters:
44
+ # A dictionary of keyword arguments to customize how you want to extract content from your HTML files.
45
+ # For the full list of available arguments, see
46
+ # the [Trafilatura documentation](https://trafilatura.readthedocs.io/en/latest/corefunctions.html#extract).
47
+ extraction_kwargs:
48
+ output_format: markdown # Extract text from HTML. You can also also choose "txt"
49
+ target_language: # You can define a language (using the ISO 639-1 format) to discard documents that don't match that language.
50
+ include_tables: true # If true, includes tables in the output
51
+ include_links: true # If true, keeps links along with their targets
52
+
53
+ docx_converter:
54
+ type: haystack.components.converters.docx.DOCXToDocument
55
+ init_parameters:
56
+ link_format: markdown
57
+
58
+ pptx_converter:
59
+ type: haystack.components.converters.pptx.PPTXToDocument
60
+ init_parameters: {}
61
+
62
+ xlsx_converter:
63
+ type: haystack.components.converters.xlsx.XLSXToDocument
64
+ init_parameters: {}
65
+
66
+ csv_converter:
67
+ type: haystack.components.converters.csv.CSVToDocument
68
+ init_parameters:
69
+ encoding: utf-8
70
+
71
+ joiner:
72
+ type: haystack.components.joiners.document_joiner.DocumentJoiner
73
+ init_parameters:
74
+ join_mode: concatenate
75
+ sort_by_score: false
76
+
77
+ joiner_xlsx: # merge split documents with non-split xlsx documents
78
+ type: haystack.components.joiners.document_joiner.DocumentJoiner
79
+ init_parameters:
80
+ join_mode: concatenate
81
+ sort_by_score: false
82
+
83
+ splitter:
84
+ type: haystack.components.preprocessors.document_splitter.DocumentSplitter
85
+ init_parameters:
86
+ split_by: word
87
+ split_length: 250
88
+ split_overlap: 30
89
+ respect_sentence_boundary: true
90
+ language: en
91
+
92
+ document_embedder:
93
+ type: haystack.components.embedders.sentence_transformers_document_embedder.SentenceTransformersDocumentEmbedder
94
+ init_parameters:
95
+ normalize_embeddings: true
96
+ model: intfloat/e5-base-v2
97
+
98
+ writer:
99
+ type: haystack.components.writers.document_writer.DocumentWriter
100
+ init_parameters:
101
+ document_store:
102
+ type: haystack_integrations.document_stores.opensearch.document_store.OpenSearchDocumentStore
103
+ init_parameters:
104
+ hosts:
105
+ index: ''
106
+ max_chunk_bytes: 104857600
107
+ embedding_dim: 768
108
+ return_embedding: false
109
+ method:
110
+ mappings:
111
+ settings:
112
+ create_index: true
113
+ http_auth:
114
+ use_ssl:
115
+ verify_certs:
116
+ timeout:
117
+ policy: OVERWRITE
118
+
119
+ connections: # Defines how the components are connected
120
+ - sender: file_classifier.text/plain
121
+ receiver: text_converter.sources
122
+ - sender: file_classifier.application/pdf
123
+ receiver: pdf_converter.sources
124
+ - sender: file_classifier.text/markdown
125
+ receiver: markdown_converter.sources
126
+ - sender: file_classifier.text/html
127
+ receiver: html_converter.sources
128
+ - sender: file_classifier.application/vnd.openxmlformats-officedocument.wordprocessingml.document
129
+ receiver: docx_converter.sources
130
+ - sender: file_classifier.application/vnd.openxmlformats-officedocument.presentationml.presentation
131
+ receiver: pptx_converter.sources
132
+ - sender: file_classifier.application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
133
+ receiver: xlsx_converter.sources
134
+ - sender: file_classifier.text/csv
135
+ receiver: csv_converter.sources
136
+ - sender: text_converter.documents
137
+ receiver: joiner.documents
138
+ - sender: pdf_converter.documents
139
+ receiver: joiner.documents
140
+ - sender: markdown_converter.documents
141
+ receiver: joiner.documents
142
+ - sender: html_converter.documents
143
+ receiver: joiner.documents
144
+ - sender: docx_converter.documents
145
+ receiver: joiner.documents
146
+ - sender: pptx_converter.documents
147
+ receiver: joiner.documents
148
+ - sender: joiner.documents
149
+ receiver: splitter.documents
150
+ - sender: splitter.documents
151
+ receiver: joiner_xlsx.documents
152
+ - sender: xlsx_converter.documents
153
+ receiver: joiner_xlsx.documents
154
+ - sender: csv_converter.documents
155
+ receiver: joiner_xlsx.documents
156
+ - sender: joiner_xlsx.documents
157
+ receiver: document_embedder.documents
158
+ - sender: document_embedder.documents
159
+ receiver: writer.documents
160
+
161
+ inputs: # Define the inputs for your pipeline
162
+ files: # This component will receive the files to index as input
163
+ - file_classifier.sources
164
+
165
+ max_runs_per_component: 100
166
+
167
+ metadata: {}
@@ -0,0 +1,12 @@
1
+ from functools import lru_cache
2
+
3
+ from model2vec import StaticModel
4
+
5
+
6
@lru_cache(maxsize=1)
def get_initialized_model() -> StaticModel:
    """Gets the initialized embedding model.

    The model is cached to avoid reloading.
    """
    model = StaticModel.from_pretrained("minishlab/potion-base-2M")
    return model
deepset_mcp/main.py ADDED
@@ -0,0 +1,133 @@
1
+ import argparse
2
+ import logging
3
+ import os
4
+ from pathlib import Path
5
+
6
+ from mcp.server.fastmcp import FastMCP
7
+
8
+ from deepset_mcp.tool_factory import WorkspaceMode, register_tools
9
+
10
# Initialize MCP Server
mcp = FastMCP("Deepset Cloud MCP", settings={"log_level": "ERROR"})

# Silence chatty third-party loggers so only warnings and errors surface.
for _noisy_logger in ("uvicorn", "uvicorn.access", "fastapi", "httpx", "httpcore", "mcp"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
19
+
20
+
21
@mcp.prompt()
async def deepset_copilot() -> str:
    """System prompt for the deepset copilot."""
    # Prompt text lives alongside this module in the prompts/ directory.
    return (Path(__file__).parent / "prompts/deepset_copilot_prompt.md").read_text()
27
+
28
+
29
@mcp.prompt()
async def deepset_recommended_prompt() -> str:
    """Recommended system prompt for the deepset copilot."""
    # Prompt text lives alongside this module in the prompts/ directory.
    return (Path(__file__).parent / "prompts/deepset_debugging_agent.md").read_text()
35
+
36
+
37
def main() -> None:
    """Entrypoint for the deepset MCP server.

    Parses CLI flags (each falling back to an environment variable),
    validates the configuration for the selected workspace mode, exports
    the resolved values via environment variables so downstream tools can
    read them, registers the requested tools, and runs the server over the
    stdio transport.
    """
    parser = argparse.ArgumentParser(description="Run the Deepset MCP server.")
    parser.add_argument(
        "--workspace",
        "-w",
        help="Deepset workspace (env DEEPSET_WORKSPACE)",
    )
    parser.add_argument(
        "--api-key",
        "-k",
        help="Deepset API key (env DEEPSET_API_KEY)",
    )
    parser.add_argument(
        "--docs-workspace",
        help="Deepset docs search workspace (env DEEPSET_DOCS_WORKSPACE)",
    )
    parser.add_argument(
        "--docs-pipeline-name",
        help="Deepset docs pipeline name (env DEEPSET_DOCS_PIPELINE_NAME)",
    )
    parser.add_argument(
        "--docs-api-key",
        help="Deepset docs pipeline API key (env DEEPSET_DOCS_API_KEY)",
    )
    parser.add_argument(
        "--workspace-mode",
        choices=["implicit", "explicit"],
        default="implicit",
        help="Whether workspace is implicit (from env) or explicit (as parameter). Default: implicit",
    )
    parser.add_argument(
        "--tools",
        nargs="*",
        help="Space-separated list of tools to register (default: all)",
    )
    parser.add_argument(
        "--list-tools",
        action="store_true",
        help="List all available tools and exit",
    )
    args = parser.parse_args()

    # Handle --list-tools flag early: no credentials are needed to list tools.
    if args.list_tools:
        from deepset_mcp.tool_factory import TOOL_REGISTRY

        print("Available tools:")
        for tool_name in sorted(TOOL_REGISTRY.keys()):
            print(f"  {tool_name}")
        return

    # Prefer CLI flags, fall back to environment variables.
    workspace = args.workspace or os.getenv("DEEPSET_WORKSPACE")
    api_key = args.api_key or os.getenv("DEEPSET_API_KEY")
    docs_workspace = args.docs_workspace or os.getenv("DEEPSET_DOCS_WORKSPACE")
    docs_pipeline_name = args.docs_pipeline_name or os.getenv("DEEPSET_DOCS_PIPELINE_NAME")
    docs_api_key = args.docs_api_key or os.getenv("DEEPSET_DOCS_API_KEY")

    # Create server configuration
    workspace_mode = WorkspaceMode(args.workspace_mode)

    # A workspace is only mandatory in implicit mode; in explicit mode each
    # tool call carries the workspace as a parameter.
    if workspace_mode == WorkspaceMode.IMPLICIT:
        if not workspace:
            parser.error("Missing workspace: set --workspace or DEEPSET_WORKSPACE (required for implicit mode)")

    if not api_key:
        parser.error("Missing API key: set --api-key or DEEPSET_API_KEY")

    # Make sure downstream tools see the resolved values. The API key is
    # required in every mode, so it is exported unconditionally; the
    # workspace is only exported when one was provided (implicit mode).
    if workspace:
        os.environ["DEEPSET_WORKSPACE"] = workspace
    os.environ["DEEPSET_API_KEY"] = api_key

    # Set docs environment variables if provided
    if docs_workspace:
        os.environ["DEEPSET_DOCS_WORKSPACE"] = docs_workspace
    if docs_pipeline_name:
        os.environ["DEEPSET_DOCS_PIPELINE_NAME"] = docs_pipeline_name
    if docs_api_key:
        os.environ["DEEPSET_DOCS_API_KEY"] = docs_api_key

    # Parse tool names if provided; None means "register all tools".
    tool_names = None
    if args.tools:
        tool_names = set(args.tools)

    # Register tools based on configuration
    register_tools(mcp, workspace_mode, workspace, tool_names)

    # Run over the stdio transport (the server communicates via stdin/stdout).
    mcp.run(transport="stdio")
130
+
131
+
132
# Allow running the module directly as a script.
if __name__ == "__main__":
    main()