qtype 0.0.16__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- qtype/application/commons/tools.py +1 -1
- qtype/application/converters/tools_from_api.py +5 -5
- qtype/application/converters/tools_from_module.py +2 -2
- qtype/application/converters/types.py +14 -43
- qtype/application/documentation.py +1 -1
- qtype/application/facade.py +92 -71
- qtype/base/types.py +227 -7
- qtype/commands/convert.py +20 -8
- qtype/commands/generate.py +19 -27
- qtype/commands/run.py +54 -36
- qtype/commands/serve.py +74 -54
- qtype/commands/validate.py +34 -8
- qtype/commands/visualize.py +46 -22
- qtype/dsl/__init__.py +6 -5
- qtype/dsl/custom_types.py +1 -1
- qtype/dsl/domain_types.py +65 -5
- qtype/dsl/linker.py +384 -0
- qtype/dsl/loader.py +315 -0
- qtype/dsl/model.py +612 -363
- qtype/dsl/parser.py +200 -0
- qtype/dsl/types.py +50 -0
- qtype/interpreter/api.py +57 -136
- qtype/interpreter/auth/aws.py +19 -9
- qtype/interpreter/auth/generic.py +93 -16
- qtype/interpreter/base/base_step_executor.py +429 -0
- qtype/interpreter/base/batch_step_executor.py +171 -0
- qtype/interpreter/base/exceptions.py +50 -0
- qtype/interpreter/base/executor_context.py +74 -0
- qtype/interpreter/base/factory.py +117 -0
- qtype/interpreter/base/progress_tracker.py +75 -0
- qtype/interpreter/base/secrets.py +339 -0
- qtype/interpreter/base/step_cache.py +73 -0
- qtype/interpreter/base/stream_emitter.py +469 -0
- qtype/interpreter/conversions.py +455 -21
- qtype/interpreter/converters.py +73 -0
- qtype/interpreter/endpoints.py +355 -0
- qtype/interpreter/executors/agent_executor.py +242 -0
- qtype/interpreter/executors/aggregate_executor.py +93 -0
- qtype/interpreter/executors/decoder_executor.py +163 -0
- qtype/interpreter/executors/doc_to_text_executor.py +112 -0
- qtype/interpreter/executors/document_embedder_executor.py +75 -0
- qtype/interpreter/executors/document_search_executor.py +122 -0
- qtype/interpreter/executors/document_source_executor.py +118 -0
- qtype/interpreter/executors/document_splitter_executor.py +105 -0
- qtype/interpreter/executors/echo_executor.py +63 -0
- qtype/interpreter/executors/field_extractor_executor.py +160 -0
- qtype/interpreter/executors/file_source_executor.py +101 -0
- qtype/interpreter/executors/file_writer_executor.py +110 -0
- qtype/interpreter/executors/index_upsert_executor.py +228 -0
- qtype/interpreter/executors/invoke_embedding_executor.py +92 -0
- qtype/interpreter/executors/invoke_flow_executor.py +51 -0
- qtype/interpreter/executors/invoke_tool_executor.py +353 -0
- qtype/interpreter/executors/llm_inference_executor.py +272 -0
- qtype/interpreter/executors/prompt_template_executor.py +78 -0
- qtype/interpreter/executors/sql_source_executor.py +106 -0
- qtype/interpreter/executors/vector_search_executor.py +91 -0
- qtype/interpreter/flow.py +147 -22
- qtype/interpreter/metadata_api.py +115 -0
- qtype/interpreter/resource_cache.py +5 -4
- qtype/interpreter/stream/chat/__init__.py +15 -0
- qtype/interpreter/stream/chat/converter.py +391 -0
- qtype/interpreter/{chat → stream/chat}/file_conversions.py +2 -2
- qtype/interpreter/stream/chat/ui_request_to_domain_type.py +140 -0
- qtype/interpreter/stream/chat/vercel.py +609 -0
- qtype/interpreter/stream/utils/__init__.py +15 -0
- qtype/interpreter/stream/utils/build_vercel_ai_formatter.py +74 -0
- qtype/interpreter/stream/utils/callback_to_stream.py +66 -0
- qtype/interpreter/stream/utils/create_streaming_response.py +18 -0
- qtype/interpreter/stream/utils/default_chat_extract_text.py +20 -0
- qtype/interpreter/stream/utils/error_streaming_response.py +20 -0
- qtype/interpreter/telemetry.py +135 -8
- qtype/interpreter/tools/__init__.py +5 -0
- qtype/interpreter/tools/function_tool_helper.py +265 -0
- qtype/interpreter/types.py +328 -0
- qtype/interpreter/typing.py +83 -89
- qtype/interpreter/ui/404/index.html +1 -1
- qtype/interpreter/ui/404.html +1 -1
- qtype/interpreter/ui/_next/static/{nUaw6_IwRwPqkzwe5s725 → 20HoJN6otZ_LyHLHpCPE6}/_buildManifest.js +1 -1
- qtype/interpreter/ui/_next/static/chunks/{393-8fd474427f8e19ce.js → 434-b2112d19f25c44ff.js} +3 -3
- qtype/interpreter/ui/_next/static/chunks/app/page-8c67d16ac90d23cb.js +1 -0
- qtype/interpreter/ui/_next/static/chunks/ba12c10f-546f2714ff8abc66.js +1 -0
- qtype/interpreter/ui/_next/static/css/8a8d1269e362fef7.css +3 -0
- qtype/interpreter/ui/icon.png +0 -0
- qtype/interpreter/ui/index.html +1 -1
- qtype/interpreter/ui/index.txt +4 -4
- qtype/semantic/checker.py +583 -0
- qtype/semantic/generate.py +262 -83
- qtype/semantic/loader.py +95 -0
- qtype/semantic/model.py +436 -159
- qtype/semantic/resolver.py +59 -17
- qtype/semantic/visualize.py +28 -31
- {qtype-0.0.16.dist-info → qtype-0.1.0.dist-info}/METADATA +16 -3
- qtype-0.1.0.dist-info/RECORD +134 -0
- qtype/dsl/base_types.py +0 -38
- qtype/dsl/validator.py +0 -465
- qtype/interpreter/batch/__init__.py +0 -0
- qtype/interpreter/batch/file_sink_source.py +0 -162
- qtype/interpreter/batch/flow.py +0 -95
- qtype/interpreter/batch/sql_source.py +0 -92
- qtype/interpreter/batch/step.py +0 -74
- qtype/interpreter/batch/types.py +0 -41
- qtype/interpreter/batch/utils.py +0 -178
- qtype/interpreter/chat/chat_api.py +0 -237
- qtype/interpreter/chat/vercel.py +0 -314
- qtype/interpreter/exceptions.py +0 -10
- qtype/interpreter/step.py +0 -67
- qtype/interpreter/steps/__init__.py +0 -0
- qtype/interpreter/steps/agent.py +0 -114
- qtype/interpreter/steps/condition.py +0 -36
- qtype/interpreter/steps/decoder.py +0 -88
- qtype/interpreter/steps/llm_inference.py +0 -171
- qtype/interpreter/steps/prompt_template.py +0 -54
- qtype/interpreter/steps/search.py +0 -24
- qtype/interpreter/steps/tool.py +0 -219
- qtype/interpreter/streaming_helpers.py +0 -123
- qtype/interpreter/ui/_next/static/chunks/app/page-7e26b6156cfb55d3.js +0 -1
- qtype/interpreter/ui/_next/static/chunks/ba12c10f-22556063851a6df2.js +0 -1
- qtype/interpreter/ui/_next/static/css/b40532b0db09cce3.css +0 -3
- qtype/interpreter/ui/favicon.ico +0 -0
- qtype/loader.py +0 -390
- qtype-0.0.16.dist-info/RECORD +0 -106
- /qtype/interpreter/ui/_next/static/{nUaw6_IwRwPqkzwe5s725 → 20HoJN6otZ_LyHLHpCPE6}/_ssgManifest.js +0 -0
- {qtype-0.0.16.dist-info → qtype-0.1.0.dist-info}/WHEEL +0 -0
- {qtype-0.0.16.dist-info → qtype-0.1.0.dist-info}/entry_points.txt +0 -0
- {qtype-0.0.16.dist-info → qtype-0.1.0.dist-info}/licenses/LICENSE +0 -0
- {qtype-0.0.16.dist-info → qtype-0.1.0.dist-info}/top_level.txt +0 -0
qtype/interpreter/executors/echo_executor.py
@@ -0,0 +1,63 @@
+from typing import AsyncIterator
+
+from qtype.interpreter.base.base_step_executor import StepExecutor
+from qtype.interpreter.base.executor_context import ExecutorContext
+from qtype.interpreter.types import FlowMessage
+from qtype.semantic.model import Echo
+
+
+class EchoExecutor(StepExecutor):
+    """Executor for Echo steps.
+
+    Passes through input variables as outputs without modification.
+    Useful for debugging flows by inspecting variable values at specific
+    points in the execution pipeline.
+    """
+
+    def __init__(
+        self,
+        step: Echo,
+        context: ExecutorContext,
+        **dependencies: object,
+    ):
+        super().__init__(step, context, **dependencies)
+        if not isinstance(step, Echo):
+            raise ValueError("EchoExecutor can only execute Echo steps.")
+        self.step: Echo = step
+
+    async def process_message(
+        self,
+        message: FlowMessage,
+    ) -> AsyncIterator[FlowMessage]:
+        """Process a single FlowMessage for the Echo step.
+
+        Reads all input variables from the message and copies them to
+        the output variables with the same IDs.
+
+        Args:
+            message: The FlowMessage to process.
+
+        Yields:
+            FlowMessage with the echoed variables.
+        """
+        try:
+            # Build a dict of output variable values by reading from inputs
+            output_vars = {}
+            for input_var in self.step.inputs:
+                value = message.variables.get(input_var.id)
+                # Find the corresponding output variable ID (should match)
+                for output_var in self.step.outputs:
+                    if output_var.id == input_var.id:
+                        output_vars[output_var.id] = value
+                        break
+
+            await self.stream_emitter.status(
+                f"Echoed {len(output_vars)} variable(s) in step {self.step.id}",
+            )
+            yield message.copy_with_variables(output_vars)
+
+        except Exception as e:
+            # Emit error event to stream so frontend can display it
+            await self.stream_emitter.error(str(e))
+            message.set_error(self.step.id, e)
+            yield message
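
For readers new to the async-generator executor pattern these new files share, here is a minimal standalone sketch of the same pass-through idea. Message, echo_step, and main are hypothetical stand-ins written for illustration only; they are not qtype APIs.

import asyncio
from dataclasses import dataclass, field, replace
from typing import AsyncIterator


@dataclass
class Message:
    # Hypothetical stand-in for FlowMessage: a bag of named variables.
    variables: dict = field(default_factory=dict)

    def copy_with_variables(self, new_variables: dict) -> "Message":
        return replace(self, variables={**self.variables, **new_variables})


async def echo_step(message: Message, var_ids: list[str]) -> AsyncIterator[Message]:
    # Copy each named input variable through unchanged, like EchoExecutor above.
    yield message.copy_with_variables(
        {var_id: message.variables.get(var_id) for var_id in var_ids}
    )


async def main() -> None:
    msg = Message(variables={"question": "hi"})
    async for out in echo_step(msg, ["question"]):
        print(out.variables)  # {'question': 'hi'}


asyncio.run(main())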
qtype/interpreter/executors/field_extractor_executor.py
@@ -0,0 +1,160 @@
+from typing import Any, AsyncIterator
+
+from jsonpath_ng.ext import parse  # type: ignore[import-untyped]
+from pydantic import BaseModel
+
+from qtype.base.types import PrimitiveTypeEnum
+from qtype.dsl.model import ListType
+from qtype.interpreter.base.base_step_executor import StepExecutor
+from qtype.interpreter.base.executor_context import ExecutorContext
+from qtype.interpreter.types import FlowMessage
+from qtype.semantic.model import FieldExtractor
+
+
+class FieldExtractorExecutor(StepExecutor):
+    """Executor for FieldExtractor steps.
+
+    Extracts fields from input data using JSONPath expressions and
+    constructs output instances. Supports 1-to-many cardinality when
+    the JSONPath matches multiple values.
+    """
+
+    def __init__(
+        self,
+        step: FieldExtractor,
+        context: ExecutorContext,
+        **dependencies: object,
+    ):
+        super().__init__(step, context, **dependencies)
+        if not isinstance(step, FieldExtractor):
+            raise ValueError(
+                "FieldExtractorExecutor can only execute FieldExtractor steps."
+            )
+        self.step: FieldExtractor = step
+
+        # Parse the JSONPath expression once at initialization
+        try:
+            self.jsonpath_expr = parse(self.step.json_path)
+        except Exception as e:
+            raise ValueError(
+                f"Invalid JSONPath expression '{self.step.json_path}': {e}"
+            ) from e
+
+    def _to_dict(self, value: Any) -> Any:
+        """Convert value to dict representation for JSONPath processing.
+
+        Args:
+            value: The value to convert (could be BaseModel, dict, list, etc.)
+
+        Returns:
+            Dict representation suitable for JSONPath processing
+        """
+        if isinstance(value, BaseModel):
+            return value.model_dump()
+        return value
+
+    def _construct_output(self, extracted_data: Any) -> Any:
+        """Construct the output value from extracted data.
+
+        Args:
+            extracted_data: The data extracted by JSONPath
+
+        Returns:
+            Constructed output value based on the output variable type
+        """
+        output_var = self.step.outputs[0]
+        output_type = output_var.type
+
+        # Handle primitive types - just return the extracted data
+        if isinstance(output_type, PrimitiveTypeEnum):
+            return extracted_data
+
+        # Handle list types
+        if isinstance(output_type, ListType):
+            # The extracted_data should already be a list
+            if not isinstance(extracted_data, list):
+                extracted_data = [extracted_data]
+            return extracted_data
+
+        # Handle BaseModel types (domain types and custom types)
+        if isinstance(output_type, type) and issubclass(
+            output_type, BaseModel
+        ):
+            # If extracted_data is a dict, use it as kwargs
+            if isinstance(extracted_data, dict):
+                return output_type(**extracted_data)
+            # If it's already the right type, return it
+            elif isinstance(extracted_data, output_type):
+                return extracted_data
+            else:
+                raise ValueError(
+                    (
+                        f"Cannot construct {output_type.__name__} from "
+                        f"{type(extracted_data).__name__}"
+                    )
+                )
+
+        # Fallback - return as-is
+        return extracted_data
+
+    async def process_message(
+        self,
+        message: FlowMessage,
+    ) -> AsyncIterator[FlowMessage]:
+        """Process a single FlowMessage for the FieldExtractor step.
+
+        Args:
+            message: The FlowMessage to process.
+
+        Yields:
+            FlowMessage(s) with extracted and constructed output values.
+            Multiple messages may be yielded if JSONPath matches multiple values.
+        """
+        input_id = self.step.inputs[0].id
+        output_id = self.step.outputs[0].id
+
+        try:
+            # Get the input value
+            input_value = message.variables.get(input_id)
+            if input_value is None:
+                raise ValueError(
+                    f"Input variable '{input_id}' is not set or is None"
+                )
+
+            await self.stream_emitter.status(
+                f"Extracting fields using JSONPath: {self.step.json_path}"
+            )
+
+            # Convert input to dict for JSONPath processing
+            input_dict = self._to_dict(input_value)
+
+            # Apply JSONPath expression
+            matches = self.jsonpath_expr.find(input_dict)
+
+            if not matches:
+                raise ValueError(
+                    (
+                        f"JSONPath expression '{self.step.json_path}' "
+                        f"did not match any data in input"
+                    )
+                )
+
+            await self.stream_emitter.status(
+                f"JSONPath matched {len(matches)} value(s)"
+            )
+
+            # Yield one message per match (1-to-many)
+            for match in matches:
+                extracted_data = match.value
+
+                # Construct the output value
+                output_value = self._construct_output(extracted_data)
+
+                # Yield message with the constructed output
+                yield message.copy_with_variables({output_id: output_value})
+
+        except Exception as e:
+            # Emit error event to stream so frontend can display it
+            await self.stream_emitter.error(str(e))
+            message.set_error(self.step.id, e)
+            yield message
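
The extractor's 1-to-many behavior comes from jsonpath_ng, the same library imported above: parse() compiles the expression once and find() returns one match object per hit. A small standalone illustration (the sample data and expression are made up):

from jsonpath_ng.ext import parse

data = {"items": [{"name": "a", "score": 1}, {"name": "b", "score": 2}]}
expr = parse("$.items[*].name")

# .find() returns one match object per hit; .value holds the extracted data.
values = [match.value for match in expr.find(data)]
print(values)  # ['a', 'b'] -- the executor would yield one output message per match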
qtype/interpreter/executors/file_source_executor.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import AsyncIterator
+
+import fsspec
+import pandas as pd
+
+from qtype.interpreter.base.base_step_executor import StepExecutor
+from qtype.interpreter.base.executor_context import ExecutorContext
+from qtype.interpreter.types import FlowMessage
+from qtype.semantic.model import ConstantPath, FileSource
+
+
+class FileSourceExecutor(StepExecutor):
+    """Executor for FileSource steps."""
+
+    def __init__(
+        self, step: FileSource, context: ExecutorContext, **dependencies
+    ):
+        super().__init__(step, context, **dependencies)
+        if not isinstance(step, FileSource):
+            raise ValueError(
+                "FileSourceExecutor can only execute FileSource steps."
+            )
+        self.step = step
+
+    async def process_message(
+        self,
+        message: FlowMessage,
+    ) -> AsyncIterator[FlowMessage]:
+        """Process a single FlowMessage for the FileSource step.
+
+        Args:
+            message: The FlowMessage to process.
+
+        Yields:
+            FlowMessages with the results of processing.
+        """
+        output_columns = {output.id for output in self.step.outputs}
+
+        # get the path
+        if isinstance(self.step.path, ConstantPath):
+            file_path = self.step.path
+        else:
+            file_path = message.variables.get(self.step.path.id)
+            if not file_path:
+                raise ValueError(
+                    (
+                        f"FileSource step {self.step.id} requires a path "
+                        "variable."
+                    )
+                )
+        await self.stream_emitter.status(
+            f"Reading file from path: {file_path}"
+        )
+
+        # Determine file format from extension
+        file_path_str = (
+            file_path.uri if isinstance(file_path, ConstantPath) else file_path
+        )
+        extension = Path(file_path_str).suffix.lower()
+
+        # Use fsspec to open the file and read with pandas
+        with fsspec.open(file_path_str, "rb") as file_handle:
+            if extension == ".csv":
+                df = pd.read_csv(file_handle)  # type: ignore[arg-type]
+            elif extension == ".parquet":
+                df = pd.read_parquet(file_handle)  # type: ignore[arg-type]
+            elif extension == ".json":
+                df = pd.read_json(file_handle)  # type: ignore[arg-type]
+            elif extension == ".jsonl":
+                df = pd.read_json(
+                    file_handle,
+                    lines=True,  # type: ignore[arg-type]
+                )
+            else:
+                # Default to parquet if no extension or unknown
+                df = pd.read_parquet(file_handle)  # type: ignore[arg-type]
+
+        # confirm the outputs exist in the dataframe
+        columns = set(df.columns)
+        missing_columns = output_columns - columns
+        if missing_columns:
+            raise ValueError(
+                (
+                    f"File {file_path_str} missing expected columns: "
+                    f"{', '.join(missing_columns)}. Available columns: "
+                    f"{', '.join(columns)}"
+                )
+            )
+
+        for row in df.to_dict(orient="records"):
+            # Filter to only the expected output columns if they exist
+            row = {
+                str(k): v for k, v in row.items() if str(k) in output_columns
+            }
+            yield message.copy_with_variables(new_variables=row)
+        await self.stream_emitter.status(
+            f"Emitted {len(df)} rows from: {file_path_str}"
+        )
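
The read path leans on fsspec, which resolves local paths as well as remote URIs (s3://, gs://, ...) to a file-like object that pandas can consume directly. A standalone sketch of that pattern, with a hypothetical path:

from pathlib import Path

import fsspec
import pandas as pd

path = "data/input.csv"  # could equally be "s3://bucket/input.parquet"
extension = Path(path).suffix.lower()

with fsspec.open(path, "rb") as handle:
    df = pd.read_csv(handle) if extension == ".csv" else pd.read_parquet(handle)

# Each row becomes one emitted message in the executor above.
for row in df.to_dict(orient="records"):
    print(row)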
qtype/interpreter/executors/file_writer_executor.py
@@ -0,0 +1,110 @@
+from itertools import groupby
+from typing import AsyncIterator
+
+import fsspec
+import pandas as pd
+
+from qtype.interpreter.base.batch_step_executor import BatchedStepExecutor
+from qtype.interpreter.base.executor_context import ExecutorContext
+from qtype.interpreter.types import FlowMessage
+from qtype.semantic.model import ConstantPath, FileWriter, Variable
+
+
+class FileWriterExecutor(BatchedStepExecutor):
+    """Executor for FileWriter steps."""
+
+    def __init__(
+        self,
+        step: FileWriter,
+        context: ExecutorContext,
+        **dependencies,
+    ):
+        super().__init__(step, context, **dependencies)
+        if not isinstance(step, FileWriter):
+            raise ValueError(
+                "FileWriterExecutor can only execute FileWriter steps."
+            )
+        self.step = step
+
+    def to_pandas(self, messages: list[FlowMessage]) -> pd.DataFrame:
+        """Convert a list of FlowMessages to a pandas DataFrame."""
+        records = [msg.variables for msg in messages]
+        return pd.DataFrame.from_records(records)
+
+    async def process_batch(
+        self,
+        batch: list[FlowMessage],
+    ) -> AsyncIterator[FlowMessage]:
+        """Process a batch of FlowMessages for the FileWriter step.
+
+        Args:
+            batch: A list of FlowMessages to process.
+
+        Yields:
+            FlowMessages with the results of processing.
+        """
+        output_name = None
+        if len(self.step.outputs):
+            output_name = self.step.outputs[0].id
+
+        if isinstance(self.step.path, ConstantPath):
+            file_path = self.step.path.uri
+            df = self.to_pandas(batch)
+            # A fixed path is provided -- just write all of the data
+            await self.stream_emitter.status(
+                f"Writing {len(df)} records to {file_path}"
+            )
+            with fsspec.open(file_path, "wb") as file_handle:
+                df.to_parquet(file_handle, index=False)  # type: ignore[arg-type]
+            await self.stream_emitter.status(
+                f"Wrote {len(df)} records to {file_path}"
+            )
+            for msg in batch:
+                yield (
+                    msg
+                    if not output_name
+                    else msg.copy_with_variables({output_name: file_path})
+                )
+        else:
+            # Group messages by file path (path is a Variable in this branch)
+            if not isinstance(self.step.path, Variable):
+                raise ValueError(
+                    "Expected path to be a Variable in dynamic path case."
+                )
+
+            path_var_id = self.step.path.id
+
+            # Sort messages by file path for groupby
+            sorted_batch = sorted(
+                batch, key=lambda msg: msg.variables[path_var_id]
+            )
+
+            # Group messages by file path
+            grouped_messages = groupby(
+                sorted_batch, key=lambda msg: msg.variables[path_var_id]
+            )
+
+            distinct_paths = list(
+                set(msg.variables[path_var_id] for msg in batch)
+            )
+            await self.stream_emitter.status(
+                f"There are {len(distinct_paths)} different files to write."
+            )
+            for file_path, msg_group in grouped_messages:
+                msg_list = list(msg_group)
+                df_group = self.to_pandas(msg_list)
+                await self.stream_emitter.status(
+                    f"Writing {len(df_group)} records to {file_path}"
+                )
+                with fsspec.open(file_path, "wb") as file_handle:
+                    df_group.to_parquet(file_handle, index=False)  # type: ignore[arg-type]
+                await self.stream_emitter.status(
+                    f"Wrote {len(df_group)} records to {file_path}"
+                )
+            # Re-yield the original messages for this group
+            for msg in batch:
+                yield (
+                    msg
+                    if not output_name
+                    else msg.copy_with_variables({output_name: file_path})
+                )
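
The dynamic-path branch sorts before grouping because itertools.groupby only merges adjacent equal keys. A standalone sketch of that sort-then-group pattern with made-up records:

from itertools import groupby

records = [
    {"path": "out/b.parquet", "value": 2},
    {"path": "out/a.parquet", "value": 1},
    {"path": "out/a.parquet", "value": 3},
]

# Without the sort, the two "out/a.parquet" records would land in separate groups.
records.sort(key=lambda r: r["path"])
for path, group in groupby(records, key=lambda r: r["path"]):
    rows = list(group)
    print(path, len(rows))  # out/a.parquet 2, then out/b.parquet 1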
qtype/interpreter/executors/index_upsert_executor.py
@@ -0,0 +1,228 @@
+"""Index upsert executor for inserting documents/chunks into indexes."""
+
+from __future__ import annotations
+
+import logging
+from typing import AsyncIterator
+
+from llama_index.core.schema import TextNode
+
+from qtype.dsl.domain_types import RAGChunk, RAGDocument
+from qtype.interpreter.base.batch_step_executor import BatchedStepExecutor
+from qtype.interpreter.base.executor_context import ExecutorContext
+from qtype.interpreter.conversions import (
+    to_llama_vector_store_and_retriever,
+    to_opensearch_client,
+)
+from qtype.interpreter.types import FlowMessage
+from qtype.semantic.model import DocumentIndex, IndexUpsert, VectorIndex
+
+logger = logging.getLogger(__name__)
+
+
+class IndexUpsertExecutor(BatchedStepExecutor):
+    """Executor for IndexUpsert steps supporting both vector and document indexes."""
+
+    def __init__(
+        self, step: IndexUpsert, context: ExecutorContext, **dependencies
+    ):
+        super().__init__(step, context, **dependencies)
+        if not isinstance(step, IndexUpsert):
+            raise ValueError(
+                "IndexUpsertExecutor can only execute IndexUpsert steps."
+            )
+        self.step: IndexUpsert = step
+
+        # Determine index type and initialize appropriate client
+        if isinstance(self.step.index, VectorIndex):
+            # Vector index for RAGChunk embeddings
+            self._vector_store, _ = to_llama_vector_store_and_retriever(
+                self.step.index, self.context.secret_manager
+            )
+            self._opensearch_client = None
+            self.index_type = "vector"
+        elif isinstance(self.step.index, DocumentIndex):
+            # Document index for text-based search
+            self._opensearch_client = to_opensearch_client(
+                self.step.index, self.context.secret_manager
+            )
+            self._vector_store = None
+            self.index_type = "document"
+            self.index_name = self.step.index.name
+        else:
+            raise ValueError(
+                f"Unsupported index type: {type(self.step.index)}"
+            )
+
+    async def process_batch(
+        self, batch: list[FlowMessage]
+    ) -> AsyncIterator[FlowMessage]:
+        """Process a batch of FlowMessages for the IndexUpsert step.
+
+        Args:
+            batch: A list of FlowMessages to process.
+
+        Yields:
+            FlowMessages: Success messages after upserting to the index
+        """
+        logger.info(
+            f"Executing IndexUpsert step: {self.step.id} with batch size: {len(batch)}"
+        )
+
+        try:
+            # Get the input variable (exactly one as validated by checker)
+            if not self.step.inputs:
+                raise ValueError("IndexUpsert step requires exactly one input")
+
+            input_var = self.step.inputs[0]
+
+            # Collect all RAGChunks or RAGDocuments from the batch
+            items_to_upsert = []
+            for message in batch:
+                input_data = message.variables.get(input_var.id)
+
+                if input_data is None:
+                    logger.warning(
+                        f"No data found for input: {input_var.id} in message"
+                    )
+                    continue
+
+                if not isinstance(input_data, (RAGChunk, RAGDocument)):
+                    raise ValueError(
+                        f"IndexUpsert only supports RAGChunk or RAGDocument "
+                        f"inputs. Got: {type(input_data)}"
+                    )
+
+                items_to_upsert.append(input_data)
+
+            # Upsert to appropriate index type
+            if items_to_upsert:
+                if self.index_type == "vector":
+                    await self._upsert_to_vector_store(items_to_upsert)
+                else:  # document index
+                    await self._upsert_to_document_index(items_to_upsert)
+
+                logger.info(
+                    f"Successfully upserted {len(items_to_upsert)} items "
+                    f"to {self.index_type} index in batch"
+                )
+
+                # Emit status update
+                index_type_display = (
+                    "vector index"
+                    if self.index_type == "vector"
+                    else "document index"
+                )
+                await self.stream_emitter.status(
+                    f"Upserted {len(items_to_upsert)} items to "
+                    f"{index_type_display}"
+                )
+
+            # Yield all input messages back (IndexUpsert typically doesn't have outputs)
+            for message in batch:
+                yield message
+
+        except Exception as e:
+            logger.error(f"Error in IndexUpsert step {self.step.id}: {e}")
+            # Emit error event to stream so frontend can display it
+            await self.stream_emitter.error(str(e))
+
+            # Mark all messages with the error and yield them
+            for message in batch:
+                message.set_error(self.step.id, e)
+                yield message
+
+    async def _upsert_to_vector_store(
+        self, items: list[RAGChunk | RAGDocument]
+    ) -> None:
+        """Upsert items to vector store.
+
+        Args:
+            items: List of RAGChunk or RAGDocument objects
+        """
+        # Convert to LlamaIndex TextNode objects
+        nodes = []
+        for item in items:
+            if isinstance(item, RAGChunk):
+                node = TextNode(
+                    id_=item.chunk_id,
+                    text=str(item.content),
+                    metadata=item.metadata,
+                    embedding=item.vector,
+                )
+            else:  # RAGDocument
+                # For documents, use file_id and convert content to string
+                node = TextNode(
+                    id_=item.file_id,
+                    text=str(item.content),
+                    metadata=item.metadata,
+                    embedding=None,  # Documents don't have embeddings
+                )
+            nodes.append(node)
+
+        # Batch upsert all nodes to the vector store
+        await self._vector_store.async_add(nodes)
+
+    async def _upsert_to_document_index(
+        self, items: list[RAGChunk | RAGDocument]
+    ) -> None:
+        """Upsert items to document index using bulk API.
+
+        Args:
+            items: List of RAGChunk or RAGDocument objects
+        """
+        # Build bulk request body
+        bulk_body = []
+        for item in items:
+            if isinstance(item, RAGChunk):
+                # Add index action
+                bulk_body.append(
+                    {
+                        "index": {
+                            "_index": self.index_name,
+                            "_id": item.chunk_id,
+                        }
+                    }
+                )
+                # Add document content
+                doc = {
+                    "text": str(item.content),
+                    "metadata": item.metadata,
+                }
+                # Include embedding if available
+                if item.vector:
+                    doc["embedding"] = item.vector
+                bulk_body.append(doc)
+            else:  # RAGDocument
+                # Add index action
+                bulk_body.append(
+                    {
+                        "index": {
+                            "_index": self.index_name,
+                            "_id": item.file_id,
+                        }
+                    }
+                )
+                # Add document content
+                doc = {
+                    "text": str(item.content),
+                    "metadata": item.metadata,
+                    "file_name": item.file_name,
+                }
+                if item.uri:
+                    doc["uri"] = item.uri
+                bulk_body.append(doc)
+
+        # Execute bulk request
+        response = self._opensearch_client.bulk(body=bulk_body)
+
+        # Check for errors
+        if response.get("errors"):
+            error_items = [
+                item
+                for item in response["items"]
+                if "error" in item.get("index", {})
+            ]
+            logger.warning(
+                f"Bulk upsert had {len(error_items)} errors: {error_items}"
+            )