qtype 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. qtype/application/facade.py +16 -17
  2. qtype/cli.py +5 -1
  3. qtype/commands/generate.py +1 -1
  4. qtype/commands/run.py +28 -5
  5. qtype/dsl/domain_types.py +24 -3
  6. qtype/dsl/model.py +56 -3
  7. qtype/interpreter/base/base_step_executor.py +8 -1
  8. qtype/interpreter/base/executor_context.py +18 -1
  9. qtype/interpreter/base/factory.py +33 -66
  10. qtype/interpreter/base/progress_tracker.py +35 -0
  11. qtype/interpreter/base/step_cache.py +3 -2
  12. qtype/interpreter/conversions.py +34 -19
  13. qtype/interpreter/converters.py +19 -13
  14. qtype/interpreter/executors/bedrock_reranker_executor.py +195 -0
  15. qtype/interpreter/executors/document_embedder_executor.py +36 -4
  16. qtype/interpreter/executors/document_search_executor.py +37 -46
  17. qtype/interpreter/executors/document_splitter_executor.py +1 -1
  18. qtype/interpreter/executors/field_extractor_executor.py +10 -5
  19. qtype/interpreter/executors/index_upsert_executor.py +115 -111
  20. qtype/interpreter/executors/invoke_embedding_executor.py +2 -2
  21. qtype/interpreter/executors/invoke_tool_executor.py +6 -1
  22. qtype/interpreter/flow.py +47 -32
  23. qtype/interpreter/rich_progress.py +225 -0
  24. qtype/interpreter/types.py +2 -0
  25. qtype/semantic/checker.py +79 -19
  26. qtype/semantic/model.py +43 -3
  27. qtype/semantic/resolver.py +4 -2
  28. {qtype-0.1.0.dist-info → qtype-0.1.2.dist-info}/METADATA +12 -11
  29. {qtype-0.1.0.dist-info → qtype-0.1.2.dist-info}/RECORD +33 -31
  30. {qtype-0.1.0.dist-info → qtype-0.1.2.dist-info}/WHEEL +0 -0
  31. {qtype-0.1.0.dist-info → qtype-0.1.2.dist-info}/entry_points.txt +0 -0
  32. {qtype-0.1.0.dist-info → qtype-0.1.2.dist-info}/licenses/LICENSE +0 -0
  33. {qtype-0.1.0.dist-info → qtype-0.1.2.dist-info}/top_level.txt +0 -0
qtype/interpreter/executors/index_upsert_executor.py CHANGED
@@ -3,9 +3,12 @@
  from __future__ import annotations

  import logging
+ import uuid
  from typing import AsyncIterator

  from llama_index.core.schema import TextNode
+ from opensearchpy import AsyncOpenSearch
+ from pydantic import BaseModel

  from qtype.dsl.domain_types import RAGChunk, RAGDocument
  from qtype.interpreter.base.batch_step_executor import BatchedStepExecutor
@@ -39,21 +42,32 @@ class IndexUpsertExecutor(BatchedStepExecutor):
  self._vector_store, _ = to_llama_vector_store_and_retriever(
  self.step.index, self.context.secret_manager
  )
- self._opensearch_client = None
  self.index_type = "vector"
  elif isinstance(self.step.index, DocumentIndex):
  # Document index for text-based search
- self._opensearch_client = to_opensearch_client(
+ self._opensearch_client: AsyncOpenSearch = to_opensearch_client(
  self.step.index, self.context.secret_manager
  )
  self._vector_store = None
  self.index_type = "document"
  self.index_name = self.step.index.name
+ self._document_index: DocumentIndex = self.step.index
  else:
  raise ValueError(
  f"Unsupported index type: {type(self.step.index)}"
  )

+ async def finalize(self) -> AsyncIterator[FlowMessage]:
+ """Clean up resources after all messages are processed."""
+ if hasattr(self, "_opensearch_client") and self._opensearch_client:
+ try:
+ await self._opensearch_client.close()
+ except Exception:
+ pass
+ # Make this an async generator
+ return
+ yield # type: ignore[unreachable]
+
  async def process_batch(
  self, batch: list[FlowMessage]
  ) -> AsyncIterator[FlowMessage]:
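The new finalize hook ends with a bare return followed by an unreachable yield, the usual Python trick for turning a cleanup-only method into an async generator so callers can drain it like any other step output. A minimal standalone sketch of that idiom (class and attribute names here are illustrative, not taken from qtype):

    import asyncio
    from typing import AsyncIterator

    class CleanupOnlyStep:
        """Illustrative: a hook that only releases resources, yet is an async generator."""

        def __init__(self) -> None:
            self.client = object()  # stand-in for an async client such as AsyncOpenSearch

        async def finalize(self) -> AsyncIterator[str]:
            try:
                self.client = None  # close/release the resource; ignore shutdown errors
            except Exception:
                pass
            # The bare return plus unreachable yield makes this an async generator,
            # so callers can always write `async for item in step.finalize()`.
            return
            yield  # pragma: no cover

    async def main() -> None:
        step = CleanupOnlyStep()
        async for _ in step.finalize():
            pass  # nothing is ever yielded

    asyncio.run(main())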
@@ -65,61 +79,18 @@ class IndexUpsertExecutor(BatchedStepExecutor):
  Yields:
  FlowMessages: Success messages after upserting to the index
  """
- logger.info(
+ logger.debug(
  f"Executing IndexUpsert step: {self.step.id} with batch size: {len(batch)}"
  )
+ if len(batch) == 0:
+ return

  try:
- # Get the input variable (exactly one as validated by checker)
- if not self.step.inputs:
- raise ValueError("IndexUpsert step requires exactly one input")
-
- input_var = self.step.inputs[0]
-
- # Collect all RAGChunks or RAGDocuments from the batch
- items_to_upsert = []
- for message in batch:
- input_data = message.variables.get(input_var.id)
-
- if input_data is None:
- logger.warning(
- f"No data found for input: {input_var.id} in message"
- )
- continue
-
- if not isinstance(input_data, (RAGChunk, RAGDocument)):
- raise ValueError(
- f"IndexUpsert only supports RAGChunk or RAGDocument "
- f"inputs. Got: {type(input_data)}"
- )
-
- items_to_upsert.append(input_data)
-
- # Upsert to appropriate index type
- if items_to_upsert:
- if self.index_type == "vector":
- await self._upsert_to_vector_store(items_to_upsert)
- else: # document index
- await self._upsert_to_document_index(items_to_upsert)
-
- logger.info(
- f"Successfully upserted {len(items_to_upsert)} items "
- f"to {self.index_type} index in batch"
- )
-
- # Emit status update
- index_type_display = (
- "vector index"
- if self.index_type == "vector"
- else "document index"
- )
- await self.stream_emitter.status(
- f"Upserted {len(items_to_upsert)} items to "
- f"{index_type_display}"
- )
-
- # Yield all input messages back (IndexUpsert typically doesn't have outputs)
- for message in batch:
+ if self.index_type == "vector":
+ result_iter = self._upsert_to_vector_store(batch)
+ else:
+ result_iter = self._upsert_to_document_index(batch)
+ async for message in result_iter:
  yield message

  except Exception as e:
@@ -133,13 +104,27 @@ class IndexUpsertExecutor(BatchedStepExecutor):
  yield message

  async def _upsert_to_vector_store(
- self, items: list[RAGChunk | RAGDocument]
- ) -> None:
+ self, batch: list[FlowMessage]
+ ) -> AsyncIterator[FlowMessage]:
  """Upsert items to vector store.

  Args:
  items: List of RAGChunk or RAGDocument objects
  """
+ # safe since semantic validation checks input length
+ input_var = self.step.inputs[0]
+
+ # Collect all RAGChunks or RAGDocuments from the batch inputs
+ items = []
+ for message in batch:
+ input_data = message.variables.get(input_var.id)
+ if not isinstance(input_data, (RAGChunk, RAGDocument)):
+ raise ValueError(
+ f"IndexUpsert only supports RAGChunk or RAGDocument "
+ f"inputs. Got: {type(input_data)}"
+ )
+ items.append(input_data)
+
  # Convert to LlamaIndex TextNode objects
  nodes = []
  for item in items:
@@ -162,67 +147,86 @@

  # Batch upsert all nodes to the vector store
  await self._vector_store.async_add(nodes)
+ num_inserted = len(items)
+
+ # Emit status update
+ await self.stream_emitter.status(
+ f"Upserted {num_inserted} items to index {self.step.index.name}"
+ )
+ for message in batch:
+ yield message

  async def _upsert_to_document_index(
- self, items: list[RAGChunk | RAGDocument]
- ) -> None:
+ self, batch: list[FlowMessage]
+ ) -> AsyncIterator[FlowMessage]:
  """Upsert items to document index using bulk API.

  Args:
- items: List of RAGChunk or RAGDocument objects
+ batch: List of FlowMessages containing documents to upsert
  """
- # Build bulk request body
+
  bulk_body = []
- for item in items:
- if isinstance(item, RAGChunk):
- # Add index action
- bulk_body.append(
- {
- "index": {
- "_index": self.index_name,
- "_id": item.chunk_id,
- }
- }
- )
- # Add document content
- doc = {
- "text": str(item.content),
- "metadata": item.metadata,
- }
- # Include embedding if available
- if item.vector:
- doc["embedding"] = item.vector
- bulk_body.append(doc)
- else: # RAGDocument
- # Add index action
- bulk_body.append(
- {
- "index": {
- "_index": self.index_name,
- "_id": item.file_id,
- }
- }
- )
- # Add document content
- doc = {
- "text": str(item.content),
- "metadata": item.metadata,
- "file_name": item.file_name,
- }
- if item.uri:
- doc["uri"] = item.uri
- bulk_body.append(doc)
-
- # Execute bulk request
- response = self._opensearch_client.bulk(body=bulk_body)
-
- # Check for errors
- if response.get("errors"):
- error_items = [
- item
- for item in response["items"]
- if "error" in item.get("index", {})
- ]
- logger.warning(
- f"Bulk upsert had {len(error_items)} errors: {error_items}"
+ message_by_id: dict[str, FlowMessage] = {}
+
+ for message in batch:
+ # Collect all input variables into a single document dict
+ doc_dict = {}
+ for input_var in self.step.inputs:
+ value = message.variables.get(input_var.id)
+
+ # Convert to dict if it's a Pydantic model
+ if isinstance(value, BaseModel):
+ value = value.model_dump()
+
+ # Merge into document dict
+ if isinstance(value, dict):
+ doc_dict.update(value)
+ else:
+ # Primitive types - use variable name as field name
+ doc_dict[input_var.id] = value
+
+ # Determine the document id field
+ id_field = None
+ if self._document_index.id_field is not None:
+ id_field = self._document_index.id_field
+ if id_field not in doc_dict:
+ raise ValueError(
+ f"Specified id_field '{id_field}' not found in inputs"
+ )
+ else:
+ # Auto-detect with fallback
+ for field in ["_id", "id", "doc_id", "document_id"]:
+ if field in doc_dict:
+ id_field = field
+ break
+ if id_field is not None:
+ doc_id = str(doc_dict[id_field])
+ else:
+ # Generate a UUID if no id field found
+ doc_id = str(uuid.uuid4())
+
+ # Add bulk action and document
+ bulk_body.append(
+ {"index": {"_index": self.index_name, "_id": doc_id}}
  )
+ bulk_body.append(doc_dict)
+ message_by_id[doc_id] = message
+
+ # Execute bulk request asynchronously
+ response = await self._opensearch_client.bulk(body=bulk_body)
+
+ num_inserted = 0
+ for item in response["items"]:
+ doc_id = item["index"]["_id"]
+ message = message_by_id[doc_id]
+ if "error" in item.get("index", {}):
+ message.set_error(
+ self.step.id,
+ Exception(item["index"]["error"]),
+ )
+ else:
+ num_inserted += 1
+ yield message
+ await self.stream_emitter.status(
+ f"Upserted {num_inserted} items to index {self.step.index.name}, {len(batch) - num_inserted} errors occurred."
+ )
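The rewritten _upsert_to_document_index assembles the request in the standard OpenSearch bulk format: every document is preceded by an action line naming the target index and document id, with the id taken from a configured id_field, a handful of common fallbacks, or a generated UUID. A small sketch of that pairing under the same assumptions (index name and fields below are made up, and the commented-out call assumes an opensearchpy AsyncOpenSearch client):

    import uuid
    from typing import Any

    COMMON_ID_FIELDS = ("_id", "id", "doc_id", "document_id")

    def build_bulk_body(
        index_name: str,
        docs: list[dict[str, Any]],
        id_field: str | None = None,
    ) -> list[dict[str, Any]]:
        """Pair each document with an `index` action line, as the bulk API expects."""
        bulk_body: list[dict[str, Any]] = []
        for doc in docs:
            if id_field is not None and id_field in doc:
                doc_id = str(doc[id_field])
            else:
                # Auto-detect a common id field, else generate a UUID.
                found = next((f for f in COMMON_ID_FIELDS if f in doc), None)
                doc_id = str(doc[found]) if found else str(uuid.uuid4())
            bulk_body.append({"index": {"_index": index_name, "_id": doc_id}})
            bulk_body.append(doc)
        return bulk_body

    body = build_bulk_body("my-index", [{"id": "42", "text": "hello"}, {"text": "no id here"}])
    # response = await client.bulk(body=body)  # client: opensearchpy.AsyncOpenSearch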
qtype/interpreter/executors/invoke_embedding_executor.py CHANGED
@@ -58,13 +58,13 @@ class InvokeEmbeddingExecutor(StepExecutor):
  if input_type == PrimitiveTypeEnum.text:
  if not isinstance(input_value, str):
  input_value = str(input_value)
- vector = self.embedding_model.get_text_embedding(
+ vector = await self.embedding_model.aget_text_embedding(
  text=input_value
  )
  content = input_value
  elif input_type == PrimitiveTypeEnum.image:
  # For image embeddings
- vector = self.embedding_model.get_image_embedding(
+ vector = await self.embedding_model.aget_image_embedding(
  image_path=input_value
  )
  content = input_value
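InvokeEmbeddingExecutor now awaits the async llama-index embedding methods instead of their blocking counterparts, so embedding calls no longer stall the event loop. A minimal sketch of the async text-embedding call, assuming the OpenAI embedding integration and an API key in the environment:

    import asyncio

    from llama_index.embeddings.openai import OpenAIEmbedding

    async def embed(text: str) -> list[float]:
        model = OpenAIEmbedding()
        # aget_text_embedding is the awaitable twin of get_text_embedding.
        return await model.aget_text_embedding(text)

    vector = asyncio.run(embed("hello world"))
    print(len(vector))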
qtype/interpreter/executors/invoke_tool_executor.py CHANGED
@@ -1,6 +1,8 @@
  from __future__ import annotations

+ import asyncio
  import importlib
+ import inspect
  import logging
  import time
  import uuid
@@ -86,7 +88,10 @@ class ToolExecutionMixin:
  )
  )

- result = function(**inputs)
+ if inspect.iscoroutinefunction(function):
+ result = await function(**inputs)
+ else:
+ result = await asyncio.to_thread(function, **inputs)
  await tool_ctx.complete(result)
  return result

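The tool invocation change dispatches on the kind of callable: coroutine functions are awaited directly, while plain functions are moved onto a worker thread with asyncio.to_thread so blocking tools cannot stall the event loop. A self-contained sketch of the same pattern (the tool functions are invented):

    import asyncio
    import inspect
    import time
    from typing import Any, Callable

    async def call_tool(function: Callable[..., Any], **inputs: Any) -> Any:
        """Await coroutine functions; run blocking callables in a thread."""
        if inspect.iscoroutinefunction(function):
            return await function(**inputs)
        return await asyncio.to_thread(function, **inputs)

    def slow_add(a: int, b: int) -> int:
        time.sleep(0.1)  # simulate blocking I/O
        return a + b

    async def fast_mul(a: int, b: int) -> int:
        return a * b

    async def main() -> None:
        print(await call_tool(slow_add, a=2, b=3))  # runs in a thread -> 5
        print(await call_tool(fast_mul, a=2, b=3))  # awaited directly -> 6

    asyncio.run(main())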
qtype/interpreter/flow.py CHANGED
@@ -1,6 +1,8 @@
  from __future__ import annotations

+ import json
  import logging
+ from collections.abc import AsyncIterator

  from openinference.semconv.trace import (
  OpenInferenceSpanKindValues,
@@ -12,6 +14,7 @@ from opentelemetry.trace import Status, StatusCode

  from qtype.interpreter.base import factory
  from qtype.interpreter.base.executor_context import ExecutorContext
+ from qtype.interpreter.rich_progress import RichProgressCallback
  from qtype.interpreter.types import FlowMessage
  from qtype.semantic.model import Flow

@@ -19,7 +22,10 @@ logger = logging.getLogger(__name__)


  async def run_flow(
- flow: Flow, initial: list[FlowMessage] | FlowMessage, **kwargs
+ flow: Flow,
+ initial: list[FlowMessage] | AsyncIterator[FlowMessage] | FlowMessage,
+ show_progress: bool = False,
+ **kwargs,
  ) -> list[FlowMessage]:
  """
  Main entrypoint for executing a flow.
@@ -38,11 +44,16 @@

  # Extract or create ExecutorContext
  exec_context = kwargs.pop("context", None)
+ progress_callback = RichProgressCallback() if show_progress else None
  if exec_context is None:
  exec_context = ExecutorContext(
  secret_manager=NoOpSecretManager(),
  tracer=trace.get_tracer(__name__),
+ on_progress=progress_callback,
  )
+ else:
+ if exec_context.on_progress is None and show_progress:
+ exec_context.on_progress = progress_callback

  # Use tracer from context
  tracer = exec_context.tracer or trace.get_tracer(__name__)
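rich_progress.py (225 new lines) is not shown in this diff, so the RichProgressCallback interface can only be inferred from how run_flow wires it in. Purely as an illustration of the general shape, not the actual implementation, a callback-style wrapper around rich.progress might look like this:

    from rich.progress import Progress

    class SimpleProgress:
        """Illustrative only: a per-step progress tracker backed by rich.progress.Progress."""

        def __init__(self) -> None:
            self._progress = Progress()
            self._progress.start()
            self._tasks: dict[str, int] = {}

        def on_step(self, step_id: str, advance: int = 1) -> None:
            if step_id not in self._tasks:
                self._tasks[step_id] = self._progress.add_task(step_id, total=None)
            self._progress.update(self._tasks[step_id], advance=advance)

        def close(self) -> None:
            self._progress.stop()

    tracker = SimpleProgress()
    for _ in range(3):
        tracker.on_step("index_upsert")
    tracker.close()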
@@ -68,38 +79,38 @@
  # 1. Get the execution plan is just the steps in order
  execution_plan = flow.steps

- # 2. Initialize the stream
- if not isinstance(initial, list):
+ # 2. Convert the initial input to an iterable of some kind. Record telemetry if possible.
+ if isinstance(initial, FlowMessage):
+ span.set_attribute("flow.input_count", 1)
+ input_vars = {k: v for k, v in initial.variables.items()}
+ span.set_attribute(
+ SpanAttributes.INPUT_VALUE,
+ json.dumps(input_vars, default=str),
+ )
+ span.set_attribute(
+ SpanAttributes.INPUT_MIME_TYPE, "application/json"
+ )
  initial = [initial]

- span.set_attribute("flow.input_count", len(initial))
-
- # Record input variables for observability
- if initial:
- import json
-
- try:
- input_vars = {
- k: v for msg in initial for k, v in msg.variables.items()
- }
- span.set_attribute(
- SpanAttributes.INPUT_VALUE,
- json.dumps(input_vars, default=str),
- )
- span.set_attribute(
- SpanAttributes.INPUT_MIME_TYPE, "application/json"
- )
- except Exception:
- # If serialization fails, skip it
- pass
+ if isinstance(initial, list):
+ span.set_attribute("flow.input_count", len(initial))

- async def initial_stream():
- for message in initial:
- yield message
+ # convert to async iterator
+ async def list_stream():
+ for message in initial:
+ yield message

- current_stream = initial_stream()
+ current_stream = list_stream()
+ elif isinstance(initial, AsyncIterator):
+ # We can't know the count ahead of time
+ current_stream = initial
+ else:
+ raise ValueError(
+ "Initial input must be a FlowMessage, list of FlowMessages, "
+ "or AsyncIterator of FlowMessages"
+ )

- # 3. Chain executors together in the main loop
+ # 4. Chain executors together in the main loop
  for step in execution_plan:
  executor = factory.create_executor(step, exec_context, **kwargs)
  output_stream = executor.execute(
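run_flow now accepts a single FlowMessage, a list, or an async iterator, and the block above normalizes all three into one async stream before the executors are chained. The normalization on its own, with a hypothetical Message stand-in for FlowMessage:

    import asyncio
    from collections.abc import AsyncIterator
    from dataclasses import dataclass, field
    from typing import Any

    @dataclass
    class Message:
        """Hypothetical stand-in for qtype's FlowMessage."""
        variables: dict[str, Any] = field(default_factory=dict)

    def to_stream(
        initial: Message | list[Message] | AsyncIterator[Message],
    ) -> AsyncIterator[Message]:
        if isinstance(initial, Message):
            initial = [initial]
        if isinstance(initial, list):
            async def list_stream() -> AsyncIterator[Message]:
                for message in initial:
                    yield message
            return list_stream()
        if isinstance(initial, AsyncIterator):
            return initial
        raise ValueError("initial must be a Message, a list of Messages, or an AsyncIterator")

    async def main() -> None:
        stream = to_stream([Message({"q": "a"}), Message({"q": "b"})])
        print([m.variables async for m in stream])

    asyncio.run(main())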
@@ -107,18 +118,19 @@
  )
  current_stream = output_stream

- # 4. Collect the final results from the last stream
+ # 5. Collect the final results from the last stream
  final_results = [state async for state in current_stream]

+ # Close the progress bars if any
+ if progress_callback is not None:
+ progress_callback.close()
  # Record flow completion metrics
  span.set_attribute("flow.output_count", len(final_results))
  error_count = sum(1 for msg in final_results if msg.is_failed())
  span.set_attribute("flow.error_count", error_count)

  # Record output variables for observability
- if final_results:
- import json
-
+ if len(final_results) == 1 and span.is_recording():
  try:
  output_vars = {
  k: v
@@ -155,6 +167,9 @@
  span.set_status(Status(StatusCode.ERROR, f"Flow failed: {e}"))
  raise
  finally:
+ # Clean up context resources if we created it
+ if kwargs.get("context") is None:
+ exec_context.cleanup()
  # Detach the context and end the span
  # Only detach if we successfully attached (span was recording)
  if token is not None: