qtype-0.0.16-py3-none-any.whl → qtype-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- qtype/application/commons/tools.py +1 -1
- qtype/application/converters/tools_from_api.py +5 -5
- qtype/application/converters/tools_from_module.py +2 -2
- qtype/application/converters/types.py +14 -43
- qtype/application/documentation.py +1 -1
- qtype/application/facade.py +94 -73
- qtype/base/types.py +227 -7
- qtype/cli.py +4 -0
- qtype/commands/convert.py +20 -8
- qtype/commands/generate.py +19 -27
- qtype/commands/run.py +73 -36
- qtype/commands/serve.py +74 -54
- qtype/commands/validate.py +34 -8
- qtype/commands/visualize.py +46 -22
- qtype/dsl/__init__.py +6 -5
- qtype/dsl/custom_types.py +1 -1
- qtype/dsl/domain_types.py +65 -5
- qtype/dsl/linker.py +384 -0
- qtype/dsl/loader.py +315 -0
- qtype/dsl/model.py +612 -363
- qtype/dsl/parser.py +200 -0
- qtype/dsl/types.py +50 -0
- qtype/interpreter/api.py +57 -136
- qtype/interpreter/auth/aws.py +19 -9
- qtype/interpreter/auth/generic.py +93 -16
- qtype/interpreter/base/base_step_executor.py +436 -0
- qtype/interpreter/base/batch_step_executor.py +171 -0
- qtype/interpreter/base/exceptions.py +50 -0
- qtype/interpreter/base/executor_context.py +74 -0
- qtype/interpreter/base/factory.py +117 -0
- qtype/interpreter/base/progress_tracker.py +110 -0
- qtype/interpreter/base/secrets.py +339 -0
- qtype/interpreter/base/step_cache.py +74 -0
- qtype/interpreter/base/stream_emitter.py +469 -0
- qtype/interpreter/conversions.py +462 -22
- qtype/interpreter/converters.py +77 -0
- qtype/interpreter/endpoints.py +355 -0
- qtype/interpreter/executors/agent_executor.py +242 -0
- qtype/interpreter/executors/aggregate_executor.py +93 -0
- qtype/interpreter/executors/decoder_executor.py +163 -0
- qtype/interpreter/executors/doc_to_text_executor.py +112 -0
- qtype/interpreter/executors/document_embedder_executor.py +107 -0
- qtype/interpreter/executors/document_search_executor.py +122 -0
- qtype/interpreter/executors/document_source_executor.py +118 -0
- qtype/interpreter/executors/document_splitter_executor.py +105 -0
- qtype/interpreter/executors/echo_executor.py +63 -0
- qtype/interpreter/executors/field_extractor_executor.py +160 -0
- qtype/interpreter/executors/file_source_executor.py +101 -0
- qtype/interpreter/executors/file_writer_executor.py +110 -0
- qtype/interpreter/executors/index_upsert_executor.py +228 -0
- qtype/interpreter/executors/invoke_embedding_executor.py +92 -0
- qtype/interpreter/executors/invoke_flow_executor.py +51 -0
- qtype/interpreter/executors/invoke_tool_executor.py +358 -0
- qtype/interpreter/executors/llm_inference_executor.py +272 -0
- qtype/interpreter/executors/prompt_template_executor.py +78 -0
- qtype/interpreter/executors/sql_source_executor.py +106 -0
- qtype/interpreter/executors/vector_search_executor.py +91 -0
- qtype/interpreter/flow.py +159 -22
- qtype/interpreter/metadata_api.py +115 -0
- qtype/interpreter/resource_cache.py +5 -4
- qtype/interpreter/rich_progress.py +225 -0
- qtype/interpreter/stream/chat/__init__.py +15 -0
- qtype/interpreter/stream/chat/converter.py +391 -0
- qtype/interpreter/{chat → stream/chat}/file_conversions.py +2 -2
- qtype/interpreter/stream/chat/ui_request_to_domain_type.py +140 -0
- qtype/interpreter/stream/chat/vercel.py +609 -0
- qtype/interpreter/stream/utils/__init__.py +15 -0
- qtype/interpreter/stream/utils/build_vercel_ai_formatter.py +74 -0
- qtype/interpreter/stream/utils/callback_to_stream.py +66 -0
- qtype/interpreter/stream/utils/create_streaming_response.py +18 -0
- qtype/interpreter/stream/utils/default_chat_extract_text.py +20 -0
- qtype/interpreter/stream/utils/error_streaming_response.py +20 -0
- qtype/interpreter/telemetry.py +135 -8
- qtype/interpreter/tools/__init__.py +5 -0
- qtype/interpreter/tools/function_tool_helper.py +265 -0
- qtype/interpreter/types.py +330 -0
- qtype/interpreter/typing.py +83 -89
- qtype/interpreter/ui/404/index.html +1 -1
- qtype/interpreter/ui/404.html +1 -1
- qtype/interpreter/ui/_next/static/{nUaw6_IwRwPqkzwe5s725 → 20HoJN6otZ_LyHLHpCPE6}/_buildManifest.js +1 -1
- qtype/interpreter/ui/_next/static/chunks/{393-8fd474427f8e19ce.js → 434-b2112d19f25c44ff.js} +3 -3
- qtype/interpreter/ui/_next/static/chunks/app/page-8c67d16ac90d23cb.js +1 -0
- qtype/interpreter/ui/_next/static/chunks/ba12c10f-546f2714ff8abc66.js +1 -0
- qtype/interpreter/ui/_next/static/css/8a8d1269e362fef7.css +3 -0
- qtype/interpreter/ui/icon.png +0 -0
- qtype/interpreter/ui/index.html +1 -1
- qtype/interpreter/ui/index.txt +4 -4
- qtype/semantic/checker.py +583 -0
- qtype/semantic/generate.py +262 -83
- qtype/semantic/loader.py +95 -0
- qtype/semantic/model.py +436 -159
- qtype/semantic/resolver.py +63 -19
- qtype/semantic/visualize.py +28 -31
- {qtype-0.0.16.dist-info → qtype-0.1.1.dist-info}/METADATA +16 -3
- qtype-0.1.1.dist-info/RECORD +135 -0
- qtype/dsl/base_types.py +0 -38
- qtype/dsl/validator.py +0 -465
- qtype/interpreter/batch/__init__.py +0 -0
- qtype/interpreter/batch/file_sink_source.py +0 -162
- qtype/interpreter/batch/flow.py +0 -95
- qtype/interpreter/batch/sql_source.py +0 -92
- qtype/interpreter/batch/step.py +0 -74
- qtype/interpreter/batch/types.py +0 -41
- qtype/interpreter/batch/utils.py +0 -178
- qtype/interpreter/chat/chat_api.py +0 -237
- qtype/interpreter/chat/vercel.py +0 -314
- qtype/interpreter/exceptions.py +0 -10
- qtype/interpreter/step.py +0 -67
- qtype/interpreter/steps/__init__.py +0 -0
- qtype/interpreter/steps/agent.py +0 -114
- qtype/interpreter/steps/condition.py +0 -36
- qtype/interpreter/steps/decoder.py +0 -88
- qtype/interpreter/steps/llm_inference.py +0 -171
- qtype/interpreter/steps/prompt_template.py +0 -54
- qtype/interpreter/steps/search.py +0 -24
- qtype/interpreter/steps/tool.py +0 -219
- qtype/interpreter/streaming_helpers.py +0 -123
- qtype/interpreter/ui/_next/static/chunks/app/page-7e26b6156cfb55d3.js +0 -1
- qtype/interpreter/ui/_next/static/chunks/ba12c10f-22556063851a6df2.js +0 -1
- qtype/interpreter/ui/_next/static/css/b40532b0db09cce3.css +0 -3
- qtype/interpreter/ui/favicon.ico +0 -0
- qtype/loader.py +0 -390
- qtype-0.0.16.dist-info/RECORD +0 -106
- /qtype/interpreter/ui/_next/static/{nUaw6_IwRwPqkzwe5s725 → 20HoJN6otZ_LyHLHpCPE6}/_ssgManifest.js +0 -0
- {qtype-0.0.16.dist-info → qtype-0.1.1.dist-info}/WHEEL +0 -0
- {qtype-0.0.16.dist-info → qtype-0.1.1.dist-info}/entry_points.txt +0 -0
- {qtype-0.0.16.dist-info → qtype-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {qtype-0.0.16.dist-info → qtype-0.1.1.dist-info}/top_level.txt +0 -0

qtype/interpreter/executors/decoder_executor.py (new file)
@@ -0,0 +1,163 @@
+import json
+import xml.etree.ElementTree as ET
+from typing import Any, AsyncIterator
+
+from qtype.dsl.model import DecoderFormat
+from qtype.interpreter.base.base_step_executor import StepExecutor
+from qtype.interpreter.base.executor_context import ExecutorContext
+from qtype.interpreter.types import FlowMessage
+from qtype.semantic.model import Decoder
+
+
+class DecoderExecutor(StepExecutor):
+    """Executor for Decoder steps."""
+
+    def __init__(
+        self, step: Decoder, context: ExecutorContext, **dependencies
+    ):
+        super().__init__(step, context, **dependencies)
+        if not isinstance(step, Decoder):
+            raise ValueError("DecoderExecutor can only execute Decoder steps.")
+        self.step: Decoder = step
+
+    def _parse_json(self, input_str: str) -> dict[str, Any]:
+        """Parse a JSON string into a Python object.
+
+        Args:
+            input_str: The JSON string to parse.
+
+        Returns:
+            A dictionary parsed from the JSON.
+
+        Raises:
+            ValueError: If the JSON is invalid or not an object.
+        """
+        try:
+            cleaned_response = input_str.strip()
+            # Remove markdown code fences if present
+            if cleaned_response.startswith("```json"):
+                cleaned_response = cleaned_response[7:]
+            if cleaned_response.endswith("```"):
+                cleaned_response = cleaned_response[:-3]
+            cleaned_response = cleaned_response.strip()
+
+            # Parse the JSON
+            parsed = json.loads(cleaned_response)
+            if not isinstance(parsed, dict):
+                raise ValueError(f"Parsed JSON is not an object: {parsed}")
+            return parsed
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON input: {e}") from e
+
+    def _parse_xml(self, input_str: str) -> dict[str, Any]:
+        """Parse an XML string into a Python object.
+
+        Args:
+            input_str: The XML string to parse.
+
+        Returns:
+            A dictionary with tag names as keys and text content as values.
+
+        Raises:
+            ValueError: If the XML is invalid.
+        """
+        try:
+            cleaned_response = input_str.strip()
+            # Remove markdown code fences if present
+            if cleaned_response.startswith("```xml"):
+                cleaned_response = cleaned_response[6:]
+            if cleaned_response.endswith("```"):
+                cleaned_response = cleaned_response[:-3]
+            cleaned_response = cleaned_response.strip()
+
+            # Escape ampersands
+            cleaned_response = cleaned_response.replace("&", "&amp;")
+            tree = ET.fromstring(cleaned_response)
+            result = {c.tag: c.text for c in tree}
+
+            return result
+        except Exception as e:
+            raise ValueError(f"Invalid XML input: {e}") from e
+
+    def _parse(self, input_str: str) -> dict[str, Any]:
+        """Parse input string based on the decoder format.
+
+        Args:
+            input_str: The string to parse.
+
+        Returns:
+            A dictionary parsed from the input.
+
+        Raises:
+            ValueError: If the format is unsupported or parsing fails.
+        """
+        if self.step.format == DecoderFormat.json:
+            return self._parse_json(input_str)
+        elif self.step.format == DecoderFormat.xml:
+            return self._parse_xml(input_str)
+        else:
+            raise ValueError(
+                (
+                    f"Unsupported decoder format: {self.step.format}. "
+                    f"Supported formats are: {DecoderFormat.json}, "
+                    f"{DecoderFormat.xml}."
+                )
+            )
+
+    async def process_message(
+        self,
+        message: FlowMessage,
+    ) -> AsyncIterator[FlowMessage]:
+        """Process a single FlowMessage for the Decoder step.
+
+        Args:
+            message: The FlowMessage to process.
+
+        Yields:
+            A FlowMessage with decoded outputs or an error.
+        """
+        input_id = self.step.inputs[0].id
+
+        try:
+            # Get the input string to decode
+            input_value = message.variables.get(input_id)
+            if not isinstance(input_value, str):
+                raise ValueError(
+                    (
+                        f"Input to decoder step {self.step.id} must be "
+                        f"a string, found {type(input_value).__name__}."
+                    )
+                )
+
+            await self.stream_emitter.status(
+                f"Decoding {self.step.format.value} input"
+            )
+
+            # Parse the input
+            result_dict = self._parse(input_value)
+
+            # Extract output variables from the parsed result
+            output_vars = {}
+            for output in self.step.outputs:
+                if output.id in result_dict:
+                    output_vars[output.id] = result_dict[output.id]
+                else:
+                    raise ValueError(
+                        (
+                            f"Output variable {output.id} not found in "
+                            f"decoded result: {result_dict}"
+                        )
+                    )
+
+            await self.stream_emitter.status(
+                f"Decoded {len(output_vars)} output variables"
+            )
+
+            # Yield the result
+            yield message.copy_with_variables(output_vars)
+
+        except Exception as e:
+            # Emit error event to stream so frontend can display it
+            await self.stream_emitter.error(str(e))
+            message.set_error(self.step.id, e)
+            yield message
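
Note: the fence-stripping in _parse_json is easy to exercise in isolation. A minimal stdlib-only sketch (the strip_code_fence helper is illustrative, not part of the package):

import json

def strip_code_fence(raw: str) -> str:
    # Same steps as _parse_json above: drop a leading ```json fence,
    # drop a trailing ``` fence, then strip surrounding whitespace.
    cleaned = raw.strip()
    if cleaned.startswith("```json"):
        cleaned = cleaned[7:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()

llm_output = '```json\n{"answer": "42", "confidence": 0.9}\n```'
assert json.loads(strip_code_fence(llm_output)) == {
    "answer": "42",
    "confidence": 0.9,
}

This matters because LLMs frequently wrap structured output in markdown fences even when asked for bare JSON.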

qtype/interpreter/executors/doc_to_text_executor.py (new file)
@@ -0,0 +1,112 @@
+from io import BytesIO
+from typing import AsyncIterator
+
+from docling.document_converter import DocumentConverter
+from docling_core.types.io import DocumentStream
+
+from qtype.base.types import PrimitiveTypeEnum
+from qtype.dsl.domain_types import RAGDocument
+from qtype.interpreter.base.base_step_executor import StepExecutor
+from qtype.interpreter.base.executor_context import ExecutorContext
+from qtype.interpreter.types import FlowMessage
+from qtype.semantic.model import DocToTextConverter
+
+
+class DocToTextConverterExecutor(StepExecutor):
+    """Executor for DocToTextConverter steps."""
+
+    def __init__(
+        self,
+        step: DocToTextConverter,
+        context: ExecutorContext,
+        **dependencies,
+    ):
+        super().__init__(step, context, **dependencies)
+        if not isinstance(step, DocToTextConverter):
+            raise ValueError(
+                (
+                    "DocToTextConverterExecutor can only execute "
+                    "DocToTextConverter steps."
+                )
+            )
+        self.step: DocToTextConverter = step
+        # Initialize the Docling converter once for the executor
+        self.docling_converter = DocumentConverter()
+
+    async def process_message(
+        self,
+        message: FlowMessage,
+    ) -> AsyncIterator[FlowMessage]:
+        """Process a single FlowMessage for the DocToTextConverter step.
+
+        Args:
+            message: The FlowMessage to process.
+        Yields:
+            FlowMessage with converted document.
+        """
+        input_id = self.step.inputs[0].id
+        output_id = self.step.outputs[0].id
+
+        try:
+            # Get the input document
+            if input_id not in message.variables:
+                raise ValueError(f"Input variable '{input_id}' is missing")
+            doc = message.variables.get(input_id)
+            if not isinstance(doc, RAGDocument):
+                raise ValueError(
+                    f"Input variable '{input_id}' must be a RAGDocument"
+                )
+
+            await self.stream_emitter.status(
+                f"Converting document: {doc.file_name}",
+            )
+
+            # Convert the document
+            converted_doc = self._convert_doc(doc)
+
+            await self.stream_emitter.status(
+                f"Converted {doc.file_name} to markdown text",
+            )
+
+            # Yield the result
+            yield message.copy_with_variables({output_id: converted_doc})
+
+        except Exception as e:
+            # Emit error event to stream so frontend can display it
+            await self.stream_emitter.error(str(e))
+            message.set_error(self.step.id, e)
+            yield message
+
+    def _convert_doc(self, doc: RAGDocument) -> RAGDocument:
+        """Convert a RAGDocument to text/markdown format.
+
+        Args:
+            doc: The document to convert.
+
+        Returns:
+            A RAGDocument with markdown text content.
+        """
+        # If already text, no conversion needed
+        if doc.type == PrimitiveTypeEnum.text:
+            return doc
+
+        # Convert based on content type
+        if isinstance(doc.content, bytes):
+            # Use DocumentStream for bytes content
+            stream = DocumentStream(
+                name=doc.file_name, stream=BytesIO(doc.content)
+            )
+            document = self.docling_converter.convert(stream).document
+        else:
+            # Convert string content directly
+            document = self.docling_converter.convert(doc.content).document
+
+        # Export to markdown
+        markdown = document.export_to_markdown()
+
+        # Return new RAGDocument with markdown content
+        return RAGDocument(
+            **doc.model_dump(exclude={"content", "type"}),
+            content=markdown,
+            type=PrimitiveTypeEnum.text,
+        )
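
Note: the Docling calls used above (convert(...) and export_to_markdown()) follow Docling's documented flow. A minimal sketch outside the executor, assuming Docling is installed; the file path is illustrative:

from docling.document_converter import DocumentConverter

converter = DocumentConverter()
# convert() accepts a file path, URL, or DocumentStream (as the
# executor uses for bytes content); .document is the parsed document.
result = converter.convert("report.pdf")
print(result.document.export_to_markdown())

Constructing DocumentConverter once in __init__ rather than per message avoids repeating Docling's model/pipeline setup on every document.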

qtype/interpreter/executors/document_embedder_executor.py (new file)
@@ -0,0 +1,107 @@
+from typing import AsyncIterator
+
+from botocore.exceptions import ClientError
+from llama_index.core.base.embeddings.base import BaseEmbedding
+from tenacity import (
+    retry,
+    retry_if_exception,
+    stop_after_attempt,
+    wait_exponential,
+)
+
+from qtype.dsl.domain_types import RAGChunk
+from qtype.interpreter.base.base_step_executor import StepExecutor
+from qtype.interpreter.base.executor_context import ExecutorContext
+from qtype.interpreter.conversions import to_embedding_model
+from qtype.interpreter.types import FlowMessage
+from qtype.semantic.model import DocumentEmbedder
+
+
+def is_throttling_error(e):
+    return (
+        isinstance(e, ClientError)
+        and e.response["Error"]["Code"] == "ThrottlingException"
+    )
+
+
+class DocumentEmbedderExecutor(StepExecutor):
+    """Executor for DocumentEmbedder steps."""
+
+    def __init__(
+        self, step: DocumentEmbedder, context: ExecutorContext, **dependencies
+    ):
+        super().__init__(step, context, **dependencies)
+        if not isinstance(step, DocumentEmbedder):
+            raise ValueError(
+                (
+                    "DocumentEmbedderExecutor can only execute "
+                    "DocumentEmbedder steps."
+                )
+            )
+        self.step: DocumentEmbedder = step
+        # Initialize the embedding model once for the executor
+        self.embedding_model: BaseEmbedding = to_embedding_model(
+            self.step.model
+        )
+
+    # TODO: properly abstract this into a mixin
+    @retry(
+        retry=retry_if_exception(is_throttling_error),
+        wait=wait_exponential(multiplier=0.5, min=1, max=30),
+        stop=stop_after_attempt(10),
+    )
+    async def _embed(self, text: str) -> list[float]:
+        """Generate embedding for the given text using the embedding model.
+
+        Args:
+            text: The text to embed.
+        Returns:
+            The embedding vector as a list of floats.
+        """
+        return await self.embedding_model.aget_text_embedding(text=text)
+
+    async def process_message(
+        self,
+        message: FlowMessage,
+    ) -> AsyncIterator[FlowMessage]:
+        """Process a single FlowMessage for the DocumentEmbedder step.
+
+        Args:
+            message: The FlowMessage to process.
+        Yields:
+            FlowMessage with embedded chunk.
+        """
+        input_id = self.step.inputs[0].id
+        output_id = self.step.outputs[0].id
+
+        try:
+            # Get the input chunk
+            chunk = message.variables.get(input_id)
+            if not isinstance(chunk, RAGChunk):
+                raise ValueError(
+                    (
+                        f"Input variable '{input_id}' must be a RAGChunk, "
+                        f"got {type(chunk)}"
+                    )
+                )
+
+            # Generate embedding for the chunk content
+            vector = await self._embed(str(chunk.content))
+
+            # Create the output chunk with the vector
+            embedded_chunk = RAGChunk(
+                vector=vector,
+                content=chunk.content,
+                chunk_id=chunk.chunk_id,
+                document_id=chunk.document_id,
+                metadata=chunk.metadata,
+            )
+
+            # Yield the result
+            yield message.copy_with_variables({output_id: embedded_chunk})
+
+        except Exception as e:
+            # Emit error event to stream so frontend can display it
+            await self.stream_emitter.error(str(e))
+            message.set_error(self.step.id, e)
+            yield message
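
Note: the @retry decorator above retries only on AWS throttling, with exponential backoff capped at 30 seconds and at most 10 attempts. The same tenacity pattern in isolation, with a stand-in exception and a synchronous function for brevity:

from tenacity import (
    retry,
    retry_if_exception,
    stop_after_attempt,
    wait_exponential,
)

class ThrottlingError(Exception):
    """Stand-in for the AWS ThrottlingException checked above."""

calls = {"n": 0}

@retry(
    retry=retry_if_exception(lambda e: isinstance(e, ThrottlingError)),
    wait=wait_exponential(multiplier=0.5, min=1, max=30),
    stop=stop_after_attempt(10),
)
def embed_once() -> str:
    calls["n"] += 1
    if calls["n"] < 3:
        raise ThrottlingError("rate limited")  # retried with backoff
    return "embedding"

assert embed_once() == "embedding" and calls["n"] == 3

Any exception other than the throttling case propagates immediately, so real input errors are not masked by the retry loop.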

qtype/interpreter/executors/document_search_executor.py (new file)
@@ -0,0 +1,122 @@
+from typing import AsyncIterator
+
+from qtype.dsl.domain_types import RAGChunk, RAGSearchResult
+from qtype.interpreter.base.base_step_executor import StepExecutor
+from qtype.interpreter.base.executor_context import ExecutorContext
+from qtype.interpreter.conversions import to_opensearch_client
+from qtype.interpreter.types import FlowMessage
+from qtype.semantic.model import DocumentSearch
+
+
+class DocumentSearchExecutor(StepExecutor):
+    """Executor for DocumentSearch steps using OpenSearch/Elasticsearch."""
+
+    def __init__(
+        self, step: DocumentSearch, context: ExecutorContext, **dependencies
+    ):
+        super().__init__(step, context, **dependencies)
+        if not isinstance(step, DocumentSearch):
+            raise ValueError(
+                (
+                    "DocumentSearchExecutor can only execute "
+                    "DocumentSearch steps."
+                )
+            )
+        self.step: DocumentSearch = step
+        # Initialize the OpenSearch client once for the executor
+        self.client = to_opensearch_client(
+            self.step.index, self._secret_manager
+        )
+        self.index_name = self.step.index.name
+
+    async def process_message(
+        self,
+        message: FlowMessage,
+    ) -> AsyncIterator[FlowMessage]:
+        """Process a single FlowMessage for the DocumentSearch step.
+
+        Args:
+            message: The FlowMessage to process.
+
+        Yields:
+            FlowMessage with search results as RAGSearchResult instances.
+        """
+        input_id = self.step.inputs[0].id
+        output_id = self.step.outputs[0].id
+
+        try:
+            # Get the search query text
+            query_text = message.variables.get(input_id)
+            if not isinstance(query_text, str):
+                raise ValueError(
+                    (
+                        f"Input variable '{input_id}' must be a string "
+                        f"(text query), got {type(query_text)}"
+                    )
+                )
+
+            # Build the search query
+            search_body = {
+                "query": {
+                    "multi_match": {
+                        "query": query_text,
+                        "fields": ["content^2", "title", "*"],
+                        "type": "best_fields",
+                    }
+                },
+                "size": 10,  # Default top 10 results
+            }
+
+            # Apply any filters if specified
+            if self.step.filters:
+                if "query" in search_body:
+                    search_body["query"] = {
+                        "bool": {
+                            "must": [search_body["query"]],
+                            "filter": [
+                                {"term": {k: v}}
+                                for k, v in self.step.filters.items()
+                            ],
+                        }
+                    }
+
+            # Execute the search
+            response = self.client.search(
+                index=self.index_name, body=search_body
+            )
+
+            # Process each hit and yield as RAGSearchResult
+            for hit in response["hits"]["hits"]:
+                source = hit["_source"]
+                doc_id = hit["_id"]
+                score = hit["_score"]
+
+                # Extract content (adjust field name based on your schema)
+                content = source.get("content", "")
+
+                # Build metadata from the source, excluding content field
+                metadata = {
+                    k: v for k, v in source.items() if k not in ["content"]
+                }
+
+                # Create a RAGChunk from the search result
+                # Use the document ID as both chunk_id and document_id
+                chunk = RAGChunk(
+                    content=content,
+                    chunk_id=doc_id,
+                    document_id=source.get("document_id", doc_id),
+                    vector=None,  # Document search doesn't return embeddings
+                    metadata=metadata,
+                )
+
+                # Wrap in RAGSearchResult with the score
+                search_result = RAGSearchResult(chunk=chunk, score=score)
+
+                # Yield result for each document
+                yield message.copy_with_variables({output_id: search_result})
+
+        except Exception as e:
+            # Emit error event to stream so frontend can display it
+            await self.stream_emitter.error(str(e))
+            message.set_error(self.step.id, e)
+            yield message
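
Note: when filters are set, the executor wraps the multi_match query in a bool query. As a worked example with illustrative values, query_text = "vector databases" and self.step.filters = {"lang": "en"} produce this final search body:

search_body = {
    "query": {
        "bool": {
            # The original full-text query becomes a "must" clause...
            "must": [
                {
                    "multi_match": {
                        "query": "vector databases",
                        "fields": ["content^2", "title", "*"],
                        "type": "best_fields",
                    }
                }
            ],
            # ...and each filter becomes an exact-match "term" clause.
            "filter": [{"term": {"lang": "en"}}],
        }
    },
    "size": 10,
}

Putting the term clauses in "filter" rather than "must" means they constrain the result set without contributing to the relevance score.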

qtype/interpreter/executors/document_source_executor.py (new file)
@@ -0,0 +1,118 @@
+import importlib
+from typing import AsyncIterator
+
+from qtype.interpreter.base.base_step_executor import StepExecutor
+from qtype.interpreter.base.executor_context import ExecutorContext
+from qtype.interpreter.conversions import from_llama_document
+from qtype.interpreter.types import FlowMessage
+from qtype.semantic.model import DocumentSource
+
+
+class DocumentSourceExecutor(StepExecutor):
+    """Executor for DocumentSource steps."""
+
+    def __init__(
+        self, step: DocumentSource, context: ExecutorContext, **dependencies
+    ):
+        super().__init__(step, context, **dependencies)
+        if not isinstance(step, DocumentSource):
+            raise ValueError(
+                (
+                    "DocumentSourceExecutor can only execute "
+                    "DocumentSource steps."
+                )
+            )
+        self.step: DocumentSource = step
+        # Initialize the reader class once for the executor
+        self.reader_class = self._load_reader_class()
+
+    def _load_reader_class(self) -> type:
+        """Load the LlamaIndex reader class dynamically.
+
+        Returns:
+            The reader class.
+
+        Raises:
+            ImportError: If the reader class cannot be imported.
+        """
+        # Parse the reader module path
+        # Format: 'file.SimpleDirectoryReader' -> llama_index.readers.file + SimpleDirectoryReader
+        # Special case: 'file.SimpleDirectoryReader' is actually in llama_index.core
+        parts = self.step.reader_module.split(".")
+        module_path = ".".join(parts[:-1])
+        class_name = parts[-1]
+
+        # Dynamically import the reader module and get the class
+        try:
+            module = importlib.import_module(module_path)
+            reader_class = getattr(module, class_name)
+            return reader_class
+        except (ImportError, AttributeError) as e:
+            raise ImportError(
+                (
+                    f"Failed to import reader class '{class_name}' "
+                    f"from '{module_path}': {e}"
+                )
+            ) from e
+
+    async def process_message(
+        self,
+        message: FlowMessage,
+    ) -> AsyncIterator[FlowMessage]:
+        """Process a single FlowMessage for the DocumentSource step.
+
+        Args:
+            message: The FlowMessage to process.
+        Yields:
+            FlowMessages with loaded documents.
+        """
+        output_id = self.step.outputs[0].id
+
+        try:
+            # Resolve any SecretReferences in step args
+            context = f"step '{self.step.id}'"
+            resolved_args = self._secret_manager.resolve_secrets_in_dict(
+                self.step.args, context
+            )
+
+            # Combine resolved step args with message variables as runtime args
+            runtime_args = {
+                key: message.variables.get(key)
+                for key in message.variables.keys()
+            }
+            combined_args = {**resolved_args, **runtime_args}
+
+            # Instantiate the reader with combined arguments
+            loader = self.reader_class(**combined_args)
+
+            # Load documents using the loader
+            if not hasattr(loader, "load_data"):
+                raise AttributeError(
+                    (
+                        f"Reader class '{self.reader_class.__name__}' "
+                        "does not have a 'load_data' method"
+                    )
+                )
+            load_args = self.step.loader_args or {}
+
+            llama_documents = loader.load_data(**load_args)
+
+            # Convert LlamaIndex Documents to RAGDocuments
+            rag_documents = [
+                from_llama_document(doc) for doc in llama_documents
+            ]
+
+            # Emit feedback about total documents loaded
+            await self.stream_emitter.status(
+                f"Loaded {len(rag_documents)} documents"
+            )
+
+            # Yield one message per document (fan-out)
+            for doc in rag_documents:
+                yield message.copy_with_variables({output_id: doc})
+
+        except Exception as e:
+            # Emit error event to stream so frontend can display it
+            await self.stream_emitter.error(str(e))
+            message.set_error(self.step.id, e)
+            yield message
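
Note: the dynamic reader loading above is plain importlib plus getattr. A standalone sketch, assuming llama-index is installed and a ./docs directory exists (the load_class helper name is ours; per the special-case comment above, SimpleDirectoryReader lives in llama_index.core):

import importlib

def load_class(dotted_path: str) -> type:
    # Split "pkg.module.ClassName" into module path and class name,
    # mirroring _load_reader_class above.
    module_path, _, class_name = dotted_path.rpartition(".")
    module = importlib.import_module(module_path)
    return getattr(module, class_name)

reader_cls = load_class("llama_index.core.SimpleDirectoryReader")
docs = reader_cls(input_dir="./docs").load_data()
print(f"Loaded {len(docs)} documents")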