palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (87)
  1. palimpzest/__init__.py +37 -6
  2. palimpzest/agents/__init__.py +0 -0
  3. palimpzest/agents/compute_agents.py +0 -0
  4. palimpzest/agents/search_agents.py +637 -0
  5. palimpzest/constants.py +259 -197
  6. palimpzest/core/data/context.py +393 -0
  7. palimpzest/core/data/context_manager.py +163 -0
  8. palimpzest/core/data/dataset.py +634 -0
  9. palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
  10. palimpzest/core/elements/groupbysig.py +16 -13
  11. palimpzest/core/elements/records.py +166 -75
  12. palimpzest/core/lib/schemas.py +152 -390
  13. palimpzest/core/{data/dataclasses.py → models.py} +306 -170
  14. palimpzest/policy.py +2 -27
  15. palimpzest/prompts/__init__.py +35 -5
  16. palimpzest/prompts/agent_prompts.py +357 -0
  17. palimpzest/prompts/context_search.py +9 -0
  18. palimpzest/prompts/convert_prompts.py +61 -5
  19. palimpzest/prompts/filter_prompts.py +50 -5
  20. palimpzest/prompts/join_prompts.py +163 -0
  21. palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
  22. palimpzest/prompts/prompt_factory.py +358 -46
  23. palimpzest/prompts/validator.py +239 -0
  24. palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
  25. palimpzest/query/execution/execution_strategy.py +210 -317
  26. palimpzest/query/execution/execution_strategy_type.py +5 -7
  27. palimpzest/query/execution/mab_execution_strategy.py +249 -136
  28. palimpzest/query/execution/parallel_execution_strategy.py +153 -244
  29. palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
  30. palimpzest/query/generators/generators.py +157 -330
  31. palimpzest/query/operators/__init__.py +15 -5
  32. palimpzest/query/operators/aggregate.py +50 -33
  33. palimpzest/query/operators/compute.py +201 -0
  34. palimpzest/query/operators/convert.py +27 -21
  35. palimpzest/query/operators/critique_and_refine_convert.py +7 -5
  36. palimpzest/query/operators/distinct.py +62 -0
  37. palimpzest/query/operators/filter.py +22 -13
  38. palimpzest/query/operators/join.py +402 -0
  39. palimpzest/query/operators/limit.py +3 -3
  40. palimpzest/query/operators/logical.py +198 -80
  41. palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
  42. palimpzest/query/operators/physical.py +27 -21
  43. palimpzest/query/operators/project.py +3 -3
  44. palimpzest/query/operators/rag_convert.py +7 -7
  45. palimpzest/query/operators/retrieve.py +9 -9
  46. palimpzest/query/operators/scan.py +81 -42
  47. palimpzest/query/operators/search.py +524 -0
  48. palimpzest/query/operators/split_convert.py +10 -8
  49. palimpzest/query/optimizer/__init__.py +7 -9
  50. palimpzest/query/optimizer/cost_model.py +108 -441
  51. palimpzest/query/optimizer/optimizer.py +123 -181
  52. palimpzest/query/optimizer/optimizer_strategy.py +66 -61
  53. palimpzest/query/optimizer/plan.py +352 -67
  54. palimpzest/query/optimizer/primitives.py +43 -19
  55. palimpzest/query/optimizer/rules.py +484 -646
  56. palimpzest/query/optimizer/tasks.py +127 -58
  57. palimpzest/query/processor/config.py +41 -76
  58. palimpzest/query/processor/query_processor.py +73 -18
  59. palimpzest/query/processor/query_processor_factory.py +46 -38
  60. palimpzest/schemabuilder/schema_builder.py +15 -28
  61. palimpzest/utils/model_helpers.py +27 -77
  62. palimpzest/utils/progress.py +114 -102
  63. palimpzest/validator/__init__.py +0 -0
  64. palimpzest/validator/validator.py +306 -0
  65. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
  66. palimpzest-0.8.0.dist-info/RECORD +95 -0
  67. palimpzest/core/lib/fields.py +0 -141
  68. palimpzest/prompts/code_synthesis_prompts.py +0 -28
  69. palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
  70. palimpzest/query/generators/api_client_factory.py +0 -30
  71. palimpzest/query/operators/code_synthesis_convert.py +0 -488
  72. palimpzest/query/operators/map.py +0 -130
  73. palimpzest/query/processor/nosentinel_processor.py +0 -33
  74. palimpzest/query/processor/processing_strategy_type.py +0 -28
  75. palimpzest/query/processor/sentinel_processor.py +0 -88
  76. palimpzest/query/processor/streaming_processor.py +0 -149
  77. palimpzest/sets.py +0 -405
  78. palimpzest/utils/datareader_helpers.py +0 -61
  79. palimpzest/utils/demo_helpers.py +0 -75
  80. palimpzest/utils/field_helpers.py +0 -69
  81. palimpzest/utils/generation_helpers.py +0 -69
  82. palimpzest/utils/sandbox.py +0 -183
  83. palimpzest-0.7.20.dist-info/RECORD +0 -95
  84. /palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
  85. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
  86. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
  87. {palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0
palimpzest/query/operators/search.py (new file)
@@ -0,0 +1,524 @@
+import functools
+import inspect
+import os
+import time
+from typing import Any
+
+# from mem0 import Memory
+from smolagents import CodeAgent, LiteLLMModel, tool
+
+# from palimpzest.agents.search_agents import DataDiscoveryAgent, SearchManagerAgent
+from palimpzest.core.data.context import Context
+from palimpzest.core.data.context_manager import ContextManager
+from palimpzest.core.elements.records import DataRecord, DataRecordSet
+from palimpzest.core.models import GenerationStats, OperatorCostEstimates, RecordOpStats
+from palimpzest.query.operators.physical import PhysicalOperator
+
+
+def make_tool(bound_method):
+    # Get the original function and bound instance
+    func = bound_method.__func__
+    instance = bound_method.__self__
+
+    # Get the signature and remove 'self'
+    sig = inspect.signature(func)
+    params = list(sig.parameters.values())[1:]  # skip 'self'
+    new_sig = inspect.Signature(parameters=params, return_annotation=sig.return_annotation)
+
+    # Create a wrapper function dynamically
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        return func(instance, *args, **kwargs)
+
+    # Update the __signature__ to reflect the new one without 'self'
+    wrapper.__signature__ = new_sig
+
+    return wrapper
+
+
+class SmolAgentsSearch(PhysicalOperator):
+    """
+    Physical operator for searching with Smol Agents.
+    """
+    def __init__(self, context_id: str, search_query: str, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.context_id = context_id
+        self.search_query = search_query
+        # self.model_id = "anthropic/claude-3-7-sonnet-latest"
+        self.model_id = "openai/gpt-4o-mini-2024-07-18"
+        # self.model_id = "openai/gpt-4o-2024-08-06"
+        api_key = os.getenv("ANTHROPIC_API_KEY") if "anthropic" in self.model_id else os.getenv("OPENAI_API_KEY")
+        self.model = LiteLLMModel(model_id=self.model_id, api_key=api_key)
+
+    def __str__(self):
+        op = super().__str__()
+        op += f"    Context ID: {self.context_id:20s}\n"
+        op += f"    Search Query: {self.search_query:20s}\n"
+        return op
+
+    def get_id_params(self):
+        id_params = super().get_id_params()
+        return {
+            "context_id": self.context_id,
+            "search_query": self.search_query,
+            **id_params,
+        }
+
+    def get_op_params(self):
+        op_params = super().get_op_params()
+        return {
+            "context_id": self.context_id,
+            "search_query": self.search_query,
+            **op_params,
+        }
+
+    def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
+        return OperatorCostEstimates(
+            cardinality=source_op_cost_estimates.cardinality,
+            time_per_record=100,
+            cost_per_record=1,
+            quality=1.0,
+        )
+
+    def _create_record_set(
+        self,
+        candidate: DataRecord,
+        generation_stats: GenerationStats,
+        total_time: float,
+        answer: dict[str, Any],
+    ) -> DataRecordSet:
+        """
+        Given an input DataRecord and a determination of whether it passed the filter or not,
+        construct the resulting RecordSet.
+        """
+        # create new DataRecord and set passed_operator attribute
+        dr = DataRecord.from_parent(self.output_schema, parent_record=candidate)
+        for field in self.output_schema.model_fields:
+            if field in answer:
+                dr[field] = answer[field]
+
+        # create RecordOpStats object
+        record_op_stats = RecordOpStats(
+            record_id=dr.id,
+            record_parent_ids=dr.parent_ids,
+            record_source_indices=dr.source_indices,
+            record_state=dr.to_dict(include_bytes=False),
+            full_op_id=self.get_full_op_id(),
+            logical_op_id=self.logical_op_id,
+            op_name=self.op_name(),
+            time_per_record=total_time,
+            cost_per_record=generation_stats.cost_per_record,
+            model_name=self.get_model_name(),
+            total_input_tokens=generation_stats.total_input_tokens,
+            total_output_tokens=generation_stats.total_output_tokens,
+            total_input_cost=generation_stats.total_input_cost,
+            total_output_cost=generation_stats.total_output_cost,
+            llm_call_duration_secs=generation_stats.llm_call_duration_secs,
+            fn_call_duration_secs=generation_stats.fn_call_duration_secs,
+            total_llm_calls=generation_stats.total_llm_calls,
+            total_embedding_llm_calls=generation_stats.total_embedding_llm_calls,
+            answer={k: v.description if isinstance(v, Context) else v for k, v in answer.items()},
+            op_details={k: str(v) for k, v in self.get_id_params().items()},
+        )
+
+        return DataRecordSet([dr], [record_op_stats])
+
+    def __call__(self, candidate: DataRecord) -> Any:
+        start_time = time.time()
+
+        # get the input context object and its tools
+        input_context: Context = candidate.context
+        description = input_context.description
+        tools = [tool(make_tool(f)) for f in input_context.tools]
+
+        # # construct the full search query
+        # full_query = f"Please execute the following search query. Output a **detailed** description of (1) which data you look at, and (2) what you find in that data. Avoid making overly broad statements such as \"What you're searching for is not present in the dataset\". Instead, make more precise statments like \"What you're searching for is not present in files A.txt, B.txt, and C.txt, but may be present elsewhere\".\n\nQUERY: {self.search_query}"
+
+        # perform the computation
+        instructions = f"\n\nHere is a description of the Context whose data you will be working with, as well as any previously computed results:\n\n{description}"
+        agent = CodeAgent(
+            tools=tools,
+            model=self.model,
+            add_base_tools=False,
+            instructions=instructions,
+            return_full_result=True,
+            additional_authorized_imports=["pandas", "io", "os"],
+        )
+        result = agent.run(self.search_query)
+        # NOTE: you can see the system prompt with `agent.memory.system_prompt.system_prompt`
+        # full_steps = agent.memory.get_full_steps()
+
+        # compute generation stats
+        response = result.output
+        input_tokens = result.token_usage.input_tokens
+        output_tokens = result.token_usage.output_tokens
+        cost_per_input_token = (3.0 / 1e6) if "anthropic" in self.model_id else (0.15 / 1e6)  # (2.5 / 1e6) #
+        cost_per_output_token = (15.0 / 1e6) if "anthropic" in self.model_id else (0.6 / 1e6)  # (10.0 / 1e6) #
+        input_cost = input_tokens * cost_per_input_token
+        output_cost = output_tokens * cost_per_output_token
+        generation_stats = GenerationStats(
+            model_name=self.model_id,
+            total_input_tokens=input_tokens,
+            total_output_tokens=output_tokens,
+            total_input_cost=input_cost,
+            total_output_cost=output_cost,
+            cost_per_record=input_cost + output_cost,
+            llm_call_duration_secs=time.time() - start_time,
+        )
+
+        # update the description of the Context to include the search result
+        new_description = f"RESULT: {response}\n\n"
+        cm = ContextManager()
+        cm.update_context(id=self.context_id, description=new_description)
+
+        # create and return record set
+        field_answers = {
+            "context": cm.get_context(id=self.context_id),
+        }
+        record_set = self._create_record_set(
+            candidate,
+            generation_stats,
+            time.time() - start_time,
+            field_answers,
+        )
+
+        return record_set
+
+
+# class SmolAgentsManagedSearch(PhysicalOperator):
+#     """
+#     Physical operator for searching with Smol Agents using an Orchestrator and a Data Discovery Agent.
+#     """
+#     def __init__(self, context_id: str, search_query: str, *args, **kwargs):
+#         super().__init__(*args, **kwargs)
+#         self.context_id = context_id
+#         self.search_query = search_query
+#         # self.model_id = "anthropic/claude-3-7-sonnet-latest"
+#         self.model_id = "openai/gpt-4o-mini-2024-07-18"
+#         # self.model_id = "o1"
+#         model_params = {
+#             "model_id": self.model_id,
+#             "custom_role_conversions": {"tool-call": "assistant", "tool-response": "user"},
+#             "max_completion_tokens": 8192,
+#         }
+#         if self.model_id == "o1":
+#             model_params["reasoning_effort"] = "high"
+#         self.model = LiteLLMModel(**model_params)
+#         self.text_limit = 100000
+#         self.memory = Memory()
+
+#     def __str__(self):
+#         op = super().__str__()
+#         op += f"    Context ID: {self.context_id:20s}\n"
+#         op += f"    Search Query: {self.search_query:20s}\n"
+#         return op
+
+#     def get_id_params(self):
+#         id_params = super().get_id_params()
+#         return {
+#             "context_id": self.context_id,
+#             "search_query": self.search_query,
+#             **id_params,
+#         }
+
+#     def get_op_params(self):
+#         op_params = super().get_op_params()
+#         return {
+#             "context_id": self.context_id,
+#             "search_query": self.search_query,
+#             **op_params,
+#         }
+
+#     def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
+#         return OperatorCostEstimates(
+#             cardinality=source_op_cost_estimates.cardinality,
+#             time_per_record=100,
+#             cost_per_record=1,
+#             quality=1.0,
+#         )
+
+#     def _create_record_set(
+#         self,
+#         candidate: DataRecord,
+#         generation_stats: GenerationStats,
+#         total_time: float,
+#         answer: dict[str, Any],
+#     ) -> DataRecordSet:
+#         """
+#         Given an input DataRecord and a determination of whether it passed the filter or not,
+#         construct the resulting RecordSet.
+#         """
+#         # create new DataRecord and set passed_operator attribute
+#         dr = DataRecord.from_parent(self.output_schema, parent_record=candidate)
+#         for field in self.output_schema.model_fields:
+#             if field in answer:
+#                 dr[field] = answer[field]
+
+#         # create RecordOpStats object
+#         record_op_stats = RecordOpStats(
+#             record_id=dr.id,
+#             record_parent_ids=dr.parent_ids,
+#             record_source_indices=dr.source_indices,
+#             record_state=dr.to_dict(include_bytes=False),
+#             full_op_id=self.get_full_op_id(),
+#             logical_op_id=self.logical_op_id,
+#             op_name=self.op_name(),
+#             time_per_record=total_time,
+#             cost_per_record=generation_stats.cost_per_record,
+#             model_name=self.get_model_name(),
+#             total_input_tokens=generation_stats.total_input_tokens,
+#             total_output_tokens=generation_stats.total_output_tokens,
+#             total_input_cost=generation_stats.total_input_cost,
+#             total_output_cost=generation_stats.total_output_cost,
+#             llm_call_duration_secs=generation_stats.llm_call_duration_secs,
+#             fn_call_duration_secs=generation_stats.fn_call_duration_secs,
+#             total_llm_calls=generation_stats.total_llm_calls,
+#             total_embedding_llm_calls=generation_stats.total_embedding_llm_calls,
+#             answer={k: v.description if isinstance(v, Context) else v for k, v in answer.items()},
+#             op_details={k: str(v) for k, v in self.get_id_params().items()},
+#         )
+
+#         return DataRecordSet([dr], [record_op_stats])
+
+#     def __call__(self, candidate: DataRecord) -> Any:
+#         start_time = time.time()
+
+#         # get the input context object and its tools
+#         input_context: Context = candidate.context
+#         description = input_context.description
+#         tools = [tool(make_tool(f)) for f in input_context.tools]
+
+#         # create a memory tool for accessing past searches
+#         @tool
+#         def tool_search_history(query: str) -> str:
+#             """
+#             This tool enables the agent to search through its history of execution in previous sessions.
+#             Thus, the agent can learn more about what it has done in the past by invoking this tool with
+#             a query describing what past interactions the agent might be curious about.
+
+#             Args:
+#                 query (str): A description of what the agent wishes to search for in its execution history.
+
+#             Returns:
+#                 str: A summary of the agent execution history which is relevant to the query.
+#             """
+#             memories = self.memory.search(query=query, user_id="data_discovery_agent")
+#             memory_str = ""
+#             for idx, memory in enumerate(memories):
+#                 memory_str += f"MEMORY {idx+1}: {memory['memory']}"
+#             return memory_str
+
+#         # tools.append(tool_search_history)
+#         data_discovery_agent = CodeAgent(
+#             model=self.model,
+#             tools=tools,
+#             max_steps=20,
+#             verbosity_level=2,
+#             planning_interval=4,
+#             name="data_discovery_agent",
+#             description="""A team member that will search a data repository to find files which help to answer your question.
+#             Ask him for all your questions that require searching a repository of relevant data.
+#             Provide him as much context as possible, in particular if you need to search on a specific timeframe!
+#             And don't hesitate to provide him with a complex search task, like finding a difference between two files.
+#             Your request must be a real sentence, not a keyword search! Like "Find me this information (...)" rather than a few keywords.
+#             """,
+#             provide_run_summary=True,
+#         )
+#         data_discovery_agent.prompt_templates["managed_agent"]["task"] += f"""\n\nHere is a description of the context you will be working with: {description}\n\nSearch as many files as possible before returning your final answer.\n\nAdditionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information."""
+
+#         manager_agent = CodeAgent(
+#             model=self.model,
+#             tools=tools,
+#             max_steps=12,
+#             verbosity_level=2,
+#             additional_authorized_imports=["*"],
+#             planning_interval=4,
+#             managed_agents=[data_discovery_agent],
+#             return_full_result=True,
+#         )
+
+#         # TODO: improve context descriptions and add memory from there; expand to multi-modal benchmark(s)
+#         # perform the computation
+#         result = manager_agent.run(self.search_query)
+
+#         # compute generation stats
+#         response = result.output
+#         input_tokens = result.token_usage.input_tokens
+#         output_tokens = result.token_usage.output_tokens
+#         cost_per_input_token = (3.0 / 1e6) if "anthropic" in self.model_id else (0.15 / 1e6)  # (15.0 / 1e6)
+#         cost_per_output_token = (15.0 / 1e6) if "anthropic" in self.model_id else (0.6 / 1e6)  # (60.0 / 1e6)
+#         input_cost = input_tokens * cost_per_input_token
+#         output_cost = output_tokens * cost_per_output_token
+#         generation_stats = GenerationStats(
+#             model_name=self.model_id,
+#             total_input_tokens=input_tokens,
+#             total_output_tokens=output_tokens,
+#             total_input_cost=input_cost,
+#             total_output_cost=output_cost,
+#             cost_per_record=input_cost + output_cost,
+#             llm_call_duration_secs=time.time() - start_time,
+#         )
+
+#         # update the description of the Context to include the search result
+#         new_description = f"RESULT: {response}\n\n"
+#         cm = ContextManager()
+#         cm.update_context(id=self.context_id, description=new_description)
+
+#         # create and return record set
+#         field_answers = {
+#             "context": cm.get_context(id=self.context_id),
+#         }
+#         record_set = self._create_record_set(
+#             candidate,
+#             generation_stats,
+#             time.time() - start_time,
+#             field_answers,
+#         )
+
+#         return record_set
+
+
+# class SmolAgentsCustomManagedSearch(PhysicalOperator):
+#     """
+#     Physical operator for searching with Smol Agents using an Orchestrator and a Data Discovery Agent.
+#     """
+#     def __init__(self, context_id: str, search_query: str, *args, **kwargs):
+#         super().__init__(*args, **kwargs)
+#         self.context_id = context_id
+#         self.search_query = search_query
+#         # self.model_id = "anthropic/claude-3-7-sonnet-latest"
+#         self.model_id = "openai/gpt-4o-mini-2024-07-18"
+#         # self.model_id = "o1"
+#         model_params = {
+#             "model_id": self.model_id,
+#             "custom_role_conversions": {"tool-call": "assistant", "tool-response": "user"},
+#             "max_completion_tokens": 8192,
+#         }
+#         if self.model_id == "o1":
+#             model_params["reasoning_effort"] = "high"
+#         self.model = LiteLLMModel(**model_params)
+#         self.text_limit = 100000
+
+#     def __str__(self):
+#         op = super().__str__()
+#         op += f"    Context ID: {self.context_id:20s}\n"
+#         op += f"    Search Query: {self.search_query:20s}\n"
+#         return op
+
+#     def get_id_params(self):
+#         id_params = super().get_id_params()
+#         return {
+#             "context_id": self.context_id,
+#             "search_query": self.search_query,
+#             **id_params,
+#         }
+
+#     def get_op_params(self):
+#         op_params = super().get_op_params()
+#         return {
+#             "context_id": self.context_id,
+#             "search_query": self.search_query,
+#             **op_params,
+#         }
+
+#     def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
+#         return OperatorCostEstimates(
+#             cardinality=source_op_cost_estimates.cardinality,
+#             time_per_record=100,
+#             cost_per_record=1,
+#             quality=1.0,
+#         )
+
+#     def _create_record_set(
+#         self,
+#         candidate: DataRecord,
+#         generation_stats: GenerationStats,
+#         total_time: float,
+#         answer: dict[str, Any],
+#     ) -> DataRecordSet:
+#         """
+#         Given an input DataRecord and a determination of whether it passed the filter or not,
+#         construct the resulting RecordSet.
+#         """
+#         # create new DataRecord and set passed_operator attribute
+#         dr = DataRecord.from_parent(self.output_schema, parent_record=candidate)
+#         for field in self.output_schema.model_fields:
+#             if field in answer:
+#                 dr[field] = answer[field]
+
+#         # create RecordOpStats object
+#         record_op_stats = RecordOpStats(
+#             record_id=dr.id,
+#             record_parent_ids=dr.parent_ids,
+#             record_source_indices=dr.source_indices,
+#             record_state=dr.to_dict(include_bytes=False),
+#             full_op_id=self.get_full_op_id(),
+#             logical_op_id=self.logical_op_id,
+#             op_name=self.op_name(),
+#             time_per_record=total_time,
+#             cost_per_record=generation_stats.cost_per_record,
+#             model_name=self.get_model_name(),
+#             total_input_tokens=generation_stats.total_input_tokens,
+#             total_output_tokens=generation_stats.total_output_tokens,
+#             total_input_cost=generation_stats.total_input_cost,
+#             total_output_cost=generation_stats.total_output_cost,
+#             llm_call_duration_secs=generation_stats.llm_call_duration_secs,
+#             fn_call_duration_secs=generation_stats.fn_call_duration_secs,
+#             total_llm_calls=generation_stats.total_llm_calls,
+#             total_embedding_llm_calls=generation_stats.total_embedding_llm_calls,
+#             answer={k: v.description if isinstance(v, Context) else v for k, v in answer.items()},
+#             op_details={k: str(v) for k, v in self.get_id_params().items()},
+#         )
+
+#         return DataRecordSet([dr], [record_op_stats])
+
+#     def __call__(self, candidate: DataRecord) -> Any:
+#         start_time = time.time()
+
+#         # get the input context object and its tools
+#         input_context: Context = candidate.context
+#         description = input_context.description
+#         tools = [tool(make_tool(f)) for f in input_context.tools]
+
+#         # TODO: add semantic operators to tools
+#         data_discovery_agent = DataDiscoveryAgent(self.context_id, description, model=self.model, tools=tools)
+#         search_manager_agent = SearchManagerAgent(self.context_id, description, model=self.model, tools=tools, managed_agents=[data_discovery_agent])
+
+#         # perform the computation
+#         result = search_manager_agent.run(self.search_query)
+
+#         # compute generation stats
+#         response = result.output
+#         input_tokens = result.token_usage.input_tokens
+#         output_tokens = result.token_usage.output_tokens
+#         cost_per_input_token = (3.0 / 1e6) if "anthropic" in self.model_id else (0.15 / 1e6)  # (15.0 / 1e6)
+#         cost_per_output_token = (15.0 / 1e6) if "anthropic" in self.model_id else (0.6 / 1e6)  # (60.0 / 1e6)
+#         input_cost = input_tokens * cost_per_input_token
+#         output_cost = output_tokens * cost_per_output_token
+#         generation_stats = GenerationStats(
+#             model_name=self.model_id,
+#             total_input_tokens=input_tokens,
+#             total_output_tokens=output_tokens,
+#             total_input_cost=input_cost,
+#             total_output_cost=output_cost,
+#             cost_per_record=input_cost + output_cost,
+#             llm_call_duration_secs=time.time() - start_time,
+#         )
+
+#         # update the description of the Context to include the search result
+#         new_description = f"RESULT: {response}\n\n"
+#         cm = ContextManager()
+#         cm.update_context(id=self.context_id, description=new_description)
+
+#         # create and return record set
+#         field_answers = {
+#             "context": cm.get_context(id=self.context_id),
+#         }
+#         record_set = self._create_record_set(
+#             candidate,
+#             generation_stats,
+#             time.time() - start_time,
+#             field_answers,
+#         )
+
+#         return record_set
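Aside on the `make_tool` helper at the top of the new `search.py`: `smolagents.tool` expects a plain function whose signature it can inspect, while a `Context` exposes its tools as bound methods, whose leading `self` parameter would otherwise leak into the generated tool schema. The helper hides the instance in a closure and rewrites `__signature__`. A minimal, self-contained sketch of the idea; the `FileContext` class is hypothetical, for illustration only:

import functools
import inspect

def make_tool(bound_method):
    # unwrap the bound method into its underlying function and instance
    func = bound_method.__func__
    instance = bound_method.__self__

    # rebuild the signature without the leading 'self' parameter
    sig = inspect.signature(func)
    params = list(sig.parameters.values())[1:]

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        return func(instance, *args, **kwargs)

    # inspect.signature() will report this signature instead of the wrapped one
    wrapper.__signature__ = sig.replace(parameters=params)
    return wrapper

class FileContext:  # hypothetical stand-in for a palimpzest Context
    def list_files(self, suffix: str) -> str:
        """List the files in this context ending with the given suffix."""
        return f"files ending in {suffix}"

list_files = make_tool(FileContext().list_files)
print(inspect.signature(list_files))  # (suffix: str) -> str -- no 'self'
print(list_files(".txt"))             # files ending in .txt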
palimpzest/query/operators/split_convert.py
@@ -2,26 +2,28 @@ from __future__ import annotations
 
 import math
 
+from pydantic.fields import FieldInfo
+
 from palimpzest.constants import (
     MODEL_CARDS,
     NAIVE_EST_NUM_INPUT_TOKENS,
     NAIVE_EST_NUM_OUTPUT_TOKENS,
     PromptStrategy,
 )
-from palimpzest.core.data.dataclasses import GenerationStats, OperatorCostEstimates
 from palimpzest.core.elements.records import DataRecord
-from palimpzest.core.lib.fields import Field, StringField
-from palimpzest.query.generators.generators import generator_factory
+from palimpzest.core.models import GenerationStats, OperatorCostEstimates
+from palimpzest.query.generators.generators import Generator
 from palimpzest.query.operators.convert import LLMConvert
 
 
 class SplitConvert(LLMConvert):
     def __init__(self, num_chunks: int = 2, min_size_to_chunk: int = 1000, *args, **kwargs):
+        kwargs["prompt_strategy"] = None
         super().__init__(*args, **kwargs)
         self.num_chunks = num_chunks
         self.min_size_to_chunk = min_size_to_chunk
-        self.split_generator = generator_factory(self.model, PromptStrategy.SPLIT_PROPOSER, self.cardinality, self.verbose)
-        self.split_merge_generator = generator_factory(self.model, PromptStrategy.SPLIT_MERGER, self.cardinality, self.verbose)
+        self.split_generator = Generator(self.model, PromptStrategy.SPLIT_PROPOSER, self.reasoning_effort, self.api_base, self.cardinality, self.verbose)
+        self.split_merge_generator = Generator(self.model, PromptStrategy.SPLIT_MERGER, self.reasoning_effort, self.api_base, self.cardinality, self.verbose)
 
         # crude adjustment factor for naive estimation in no-sentinel setting
         self.naive_quality_adjustment = 0.6
@@ -103,8 +105,8 @@ class SplitConvert(LLMConvert):
             content = candidate[field_name]
 
             # do not chunk this field if it is not a string or a list of strings
-            is_string_field = isinstance(field, StringField)
-            is_list_string_field = hasattr(field, "element_type") and isinstance(field.element_type, StringField)
+            is_string_field = field.annotation in [str, str | None]
+            is_list_string_field = field.annotation in [list[str], list[str] | None]
            if not (is_string_field or is_list_string_field):
                 field_name_to_chunked_content[field_name] = [content]
                 continue
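Context for this hunk: in 0.8.0 schema fields are plain pydantic v2 `FieldInfo` objects rather than palimpzest's removed `Field`/`StringField` classes (see `palimpzest/core/lib/fields.py +0 -141` above), so the string check becomes an equality test on the declared annotation. A small self-contained sketch of that behavior; the `Example` model here is hypothetical:

from pydantic import BaseModel

class Example(BaseModel):  # hypothetical schema, for illustration only
    title: str
    tags: list[str] | None
    count: int

for field_name, field in Example.model_fields.items():
    # mirrors the SplitConvert checks: chunk only strings and lists of strings
    is_string_field = field.annotation in [str, str | None]
    is_list_string_field = field.annotation in [list[str], list[str] | None]
    print(field_name, is_string_field or is_list_string_field)
# title True
# tags True
# count False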
@@ -136,7 +138,7 @@ class SplitConvert(LLMConvert):
 
         return candidates
 
-    def convert(self, candidate: DataRecord, fields: dict[str, Field]) -> tuple[dict[str, list], GenerationStats]:
+    def convert(self, candidate: DataRecord, fields: dict[str, FieldInfo]) -> tuple[dict[str, list], GenerationStats]:
         # get the set of input fields to use for the convert operation
         input_fields = self.get_input_fields()
 
palimpzest/query/optimizer/__init__.py
@@ -1,15 +1,10 @@
+from palimpzest.query.optimizer.rules import AddContextsBeforeComputeRule as _AddContextsBeforeComputeRule
 from palimpzest.query.optimizer.rules import (
     AggregateRule as _AggregateRule,
 )
 from palimpzest.query.optimizer.rules import (
     BasicSubstitutionRule as _BasicSubstitutionRule,
 )
-from palimpzest.query.optimizer.rules import (
-    CodeSynthesisConvertRule as _CodeSynthesisConvertRule,
-)
-from palimpzest.query.optimizer.rules import (
-    CodeSynthesisConvertSingleRule as _CodeSynthesisConvertSingleRule,
-)
 from palimpzest.query.optimizer.rules import (
     CriticAndRefineConvertRule as _CriticAndRefineConvertRule,
 )
@@ -22,6 +17,9 @@ from palimpzest.query.optimizer.rules import (
 from palimpzest.query.optimizer.rules import (
     LLMFilterRule as _LLMFilterRule,
 )
+from palimpzest.query.optimizer.rules import (
+    LLMJoinRule as _LLMJoinRule,
+)
 from palimpzest.query.optimizer.rules import (
     MixtureOfAgentsConvertRule as _MixtureOfAgentsConvertRule,
 )
@@ -51,14 +49,14 @@ from palimpzest.query.optimizer.rules import (
 )
 
 ALL_RULES = [
+    _AddContextsBeforeComputeRule,
     _AggregateRule,
     _BasicSubstitutionRule,
-    _CodeSynthesisConvertRule,
-    _CodeSynthesisConvertSingleRule,
     _CriticAndRefineConvertRule,
     _ImplementationRule,
     _LLMConvertBondedRule,
     _LLMFilterRule,
+    _LLMJoinRule,
     _MixtureOfAgentsConvertRule,
     _NonLLMConvertRule,
     _NonLLMFilterRule,
@@ -74,7 +72,7 @@ IMPLEMENTATION_RULES = [
     rule
     for rule in ALL_RULES
     if issubclass(rule, _ImplementationRule)
-    and rule not in [_CodeSynthesisConvertRule, _ImplementationRule]
+    and rule not in [_ImplementationRule]
 ]
 
 TRANSFORMATION_RULES = [
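The surrounding (unchanged) code derives the rule groups from ALL_RULES by subclass checks, so with code synthesis removed only the abstract `_ImplementationRule` base itself still needs to be excluded. A toy sketch of the pattern, using placeholder classes rather than the real palimpzest rule hierarchy:

class Rule: ...
class ImplementationRule(Rule): ...          # abstract base for implementation rules
class LLMJoinRule(ImplementationRule): ...   # concrete implementation rule
class AggregateRule(Rule): ...               # not an implementation rule

ALL_RULES = [AggregateRule, ImplementationRule, LLMJoinRule]

# keep concrete implementation rules; exclude the abstract base itself
IMPLEMENTATION_RULES = [
    rule
    for rule in ALL_RULES
    if issubclass(rule, ImplementationRule) and rule not in [ImplementationRule]
]
assert IMPLEMENTATION_RULES == [LLMJoinRule]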