palimpzest 0.5.4__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/__init__.py +7 -9
- palimpzest/constants.py +47 -7
- palimpzest/core/__init__.py +20 -26
- palimpzest/core/data/dataclasses.py +9 -2
- palimpzest/core/data/datareaders.py +497 -0
- palimpzest/core/elements/records.py +29 -37
- palimpzest/core/lib/fields.py +14 -12
- palimpzest/core/lib/schemas.py +80 -94
- palimpzest/policy.py +58 -0
- palimpzest/prompts/__init__.py +22 -0
- palimpzest/prompts/code_synthesis_prompts.py +28 -0
- palimpzest/prompts/convert_prompts.py +87 -0
- palimpzest/prompts/critique_and_refine_convert_prompts.py +216 -0
- palimpzest/prompts/filter_prompts.py +69 -0
- palimpzest/prompts/moa_aggregator_convert_prompts.py +57 -0
- palimpzest/prompts/moa_proposer_convert_prompts.py +79 -0
- palimpzest/prompts/prompt_factory.py +732 -0
- palimpzest/prompts/util_phrases.py +14 -0
- palimpzest/query/execution/execution_strategy.py +0 -3
- palimpzest/query/execution/parallel_execution_strategy.py +12 -25
- palimpzest/query/execution/single_threaded_execution_strategy.py +31 -45
- palimpzest/query/generators/generators.py +71 -347
- palimpzest/query/operators/__init__.py +5 -5
- palimpzest/query/operators/aggregate.py +10 -5
- palimpzest/query/operators/code_synthesis_convert.py +4 -48
- palimpzest/query/operators/convert.py +5 -2
- palimpzest/query/operators/critique_and_refine_convert.py +112 -0
- palimpzest/query/operators/filter.py +1 -1
- palimpzest/query/operators/limit.py +1 -1
- palimpzest/query/operators/logical.py +28 -27
- palimpzest/query/operators/mixture_of_agents_convert.py +4 -1
- palimpzest/query/operators/physical.py +32 -20
- palimpzest/query/operators/project.py +1 -1
- palimpzest/query/operators/rag_convert.py +6 -3
- palimpzest/query/operators/retrieve.py +13 -31
- palimpzest/query/operators/scan.py +150 -0
- palimpzest/query/optimizer/__init__.py +5 -1
- palimpzest/query/optimizer/cost_model.py +18 -34
- palimpzest/query/optimizer/optimizer.py +40 -25
- palimpzest/query/optimizer/optimizer_strategy.py +26 -0
- palimpzest/query/optimizer/plan.py +2 -2
- palimpzest/query/optimizer/rules.py +118 -27
- palimpzest/query/processor/config.py +12 -1
- palimpzest/query/processor/mab_sentinel_processor.py +125 -112
- palimpzest/query/processor/nosentinel_processor.py +46 -62
- palimpzest/query/processor/query_processor.py +10 -20
- palimpzest/query/processor/query_processor_factory.py +12 -5
- palimpzest/query/processor/random_sampling_sentinel_processor.py +112 -91
- palimpzest/query/processor/streaming_processor.py +11 -17
- palimpzest/sets.py +170 -94
- palimpzest/tools/pdfparser.py +5 -64
- palimpzest/utils/datareader_helpers.py +61 -0
- palimpzest/utils/field_helpers.py +69 -0
- palimpzest/utils/hash_helpers.py +3 -2
- palimpzest/utils/udfs.py +0 -28
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/METADATA +49 -49
- palimpzest-0.6.0.dist-info/RECORD +87 -0
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/top_level.txt +0 -1
- cli/README.md +0 -156
- cli/__init__.py +0 -0
- cli/cli_main.py +0 -390
- palimpzest/config.py +0 -89
- palimpzest/core/data/datasources.py +0 -369
- palimpzest/datamanager/__init__.py +0 -0
- palimpzest/datamanager/datamanager.py +0 -300
- palimpzest/prompts.py +0 -397
- palimpzest/query/operators/datasource.py +0 -202
- palimpzest-0.5.4.dist-info/RECORD +0 -83
- palimpzest-0.5.4.dist-info/entry_points.txt +0 -2
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/LICENSE +0 -0
- {palimpzest-0.5.4.dist-info → palimpzest-0.6.0.dist-info}/WHEEL +0 -0
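The per-file summary above can be reproduced locally, since wheels are plain zip archives. The sketch below is illustrative and not part of palimpzest; it assumes both wheel files have already been downloaded (e.g. with `pip download palimpzest==0.5.4 --no-deps`) and only compares member lists, while the +/- line counts come from diffing the individual files.

```python
# Illustrative sketch: compare the member lists of the two wheels.
# Assumes both wheel files are present in the working directory.
import zipfile

OLD_WHEEL = "palimpzest-0.5.4-py3-none-any.whl"
NEW_WHEEL = "palimpzest-0.6.0-py3-none-any.whl"

with zipfile.ZipFile(OLD_WHEEL) as old, zipfile.ZipFile(NEW_WHEEL) as new:
    old_files, new_files = set(old.namelist()), set(new.namelist())

print("added:", *sorted(new_files - old_files), sep="\n  ")
print("removed:", *sorted(old_files - new_files), sep="\n  ")
```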
palimpzest/prompts.py
DELETED
@@ -1,397 +0,0 @@
"""This file contains prompts used by Palimpzest
Whenever they are called, they can be parameterized with the str.format() method using the parameter names that are in brackets.
For now, this is an easy decoupling. In the future, we maybe want a more sophisticated approach like a PromptBuilder.
"""

### FORMATTING INSTRUCTIONS ###
ONE_TO_ONE_OUTPUT_FORMAT_INSTRUCTION = "Remember, your answer must be a valid JSON dictionary. The dictionary should only have the specified output fields."
ONE_TO_MANY_OUTPUT_FORMAT_INSTRUCTION = "Remember, your answer must be a valid JSON list of dictionaries. The list may contain one or more dictionaries, and each dictionary should only have the specified output fields."

### REASONING INSTRUCTION FOR IMAGE PROMPTS ###
IMAGE_REASONING_SUFFIX = """Let's think step by step in order to answer the question.

REASONING: """
IMAGE_ANSWER_SUFFIX = """Let's think step by step in order to answer the question.

ANSWER: """

### DEVELOPER / SYSTEM PROMPTS ###
COT_BOOL_SYSTEM_PROMPT = """You are a helpful assistant whose job is to answer a TRUE / FALSE question.
You will be presented with a context and a filter condition. Output TRUE if the context satisfies the filter condition, and FALSE otherwise.

Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---

An example is shown below:
---
CONTEXT:
{{
"text": "The quick brown fox jumps over the lazy dog."
}}

INPUT FIELDS:
- text: a short passage of text

FILTER CONDITION: the text mentions an animal

Let's think step by step in order to answer the question.

REASONING: the text mentions the words "fox" and "dog" which are animals, therefore the answer is TRUE.

ANSWER: TRUE
---
"""

COT_BOOL_IMAGE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to analyze input image(s) and/or text in order to answer a TRUE / FALSE question.
You will be presented with the image(s) and a filter condition. You may also have some textual inputs. Output TRUE if the input(s) satisfy the filter condition, and FALSE otherwise.

Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---

An example is shown below:
---
CONTEXT:
{{
"image": <bytes>,
"photographer": "CameraEnthusiast1"
}}

INPUT FIELDS:
- image: an image of a scene
- photographer: the photographer of the image

FILTER CONDITION: there's an animal in this image

<image content provided here; assume in this example the image shows a dog and a cat playing>

Let's think step by step in order to answer the question.

REASONING: the image shows a dog and a cat playing, both of which are animals, therefore the answer is TRUE.

ANSWER: TRUE
---
"""

COT_QA_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
You will be presented with a context and a set of output fields to generate. Your task is to generate a JSON object which fills in the output fields with the correct values.
You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the context.

{output_format_instruction} Finish your response with a newline character followed by ---

An example is shown below:
---
CONTEXT:
{{
"text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
"birthday": "December 10, 1815"
}}

INPUT FIELDS:
- text: a text passage describing a scientist
- birthday: the scientist's birthday

OUTPUT FIELDS:
- name: the name of the scientist
- birth_year: the year the scientist was born

Let's think step by step in order to answer the question.

REASONING: the text passage mentions the scientist's name as "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace" and the scientist's birthday as "December 10, 1815". Therefore, the name of the scientist is "Augusta Ada King" and the birth year is 1815.

ANSWER:
{{
"name": "Augusta Ada King",
"birth_year": 1815
}}
---
"""

COT_QA_IMAGE_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to analyze input image(s) and/or text in order to produce a JSON object.
You will be presented with the image(s) and a set of output fields to generate. You may also have some textual inputs. Your task is to generate a JSON object which fills in the output fields with the correct values.
You will be provided with a description of each output field. All of the fields in the output JSON object can be derived using information from the input(s).

{output_format_instruction} Finish your response with a newline character followed by ---

An example is shown below:
---
CONTEXT:
{{
"image": <bytes>,
"photographer": "CameraEnthusiast1"
}}

INPUT FIELDS:
- image: an image of a scene
- photographer: the photographer of the image

OUTPUT FIELDS:
- dog_in_image: true if a dog is in the image and false otherwise
- person_in_image: true if a person is in the image and false otherwise

<image content provided here; assume in this example the image shows a dog and a cat playing>

Let's think step by step in order to answer the question.

REASONING: The image shows a dog playing with a cat, so there is a dog in the image. There is no person in the image.

ANSWER:
{{
"dog_in_image": true,
"person_in_image": false
}}
---
"""

COT_MOA_PROPOSER_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to produce an answer to a question.
You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.

You will be provided with a description of each input field and each output field.

{output_format_instruction} Finish your response with a newline character followed by ---

An example is shown below:
---
CONTEXT:
{{
"text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
"birthday": "December 10, 1815"
}}

INPUT FIELDS:
- text: a text passage describing a scientist
- birthday: the scientist's birthday

OUTPUT FIELDS:
- name: the name of the scientist
- birth_year: the year the scientist was born

Let's think step by step in order to answer the question.

ANSWER: the text passage mentions the scientist's name as "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace" and the scientist's birthday as "December 10, 1815". Therefore, the name of the scientist is "Augusta Ada King" and the birth year is 1815.
---
"""

COT_MOA_PROPOSER_IMAGE_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to analyze input image(s) and/or text in order to produce an answer to a question.
You will be presented with the image(s) and a set of output fields to generate. You may also have some textual inputs. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
Be sure to cite information from the input(s) as evidence of why your answers are correct. Do not hallucinate evidence.

You will be provided with a description of each input field and each output field.

{output_format_instruction} Finish your response with a newline character followed by ---

An example is shown below:
---
CONTEXT:
{{
"image": <bytes>,
"photographer": "CameraEnthusiast1"
}}

INPUT FIELDS:
- image: an image of a scene
- photographer: the photographer of the image

OUTPUT FIELDS:
- dog_in_image: true if a dog is in the image and false otherwise
- person_in_image: true if a person is in the image and false otherwise

<image content provided here; assume in this example the image shows a dog and a cat playing>

Let's think step by step in order to answer the question.

ANSWER: The image shows a dog playing with a cat, so there is a dog in the image. There is no person in the image.
---
"""

COT_MOA_AGG_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
You will be presented with one or more outputs produced by a set of models. Your task is to synthesize these responses into a single, high-quality JSON object which fills in the output fields with the correct values.
It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased or incorrect.

You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the model responses.

{output_format_instruction} Finish your response with a newline character followed by ---

An example is shown below:
---
MODEL RESPONSE 1: the text mentions the scientist's full name "Augusta Ada King, Countess of Lovelace" and states she was an English mathematician who worked on Babbage's Analytical Engine.

MODEL RESPONSE 2: the text passage mentions the scientist's name as "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace" and the scientist's birthday as "December 10, 1815". Therefore, the name of the scientist is "Augusta Ada King" and the birth year is 1815.

INPUT FIELDS:
- text: a text passage describing a scientist
- birthday: the scientist's birthday

OUTPUT FIELDS:
- name: the name of the scientist
- birth_year: the year the scientist was born

Let's think step by step in order to answer the question.

REASONING: Looking at both model responses, they agree that the scientist's formal name is "Augusta Ada King". Model Response 2 correctly extracts the birth year from the birthday field as 1815. The responses are consistent and provide sufficient evidence for these values.

ANSWER:
{{
"name": "Augusta Ada King",
"birth_year": 1815
}}
---
"""


### USER / INSTANCE-SPECIFIC PROMPTS ###
COT_BOOL_USER_PROMPT = """You are a helpful assistant whose job is to answer a TRUE / FALSE question.
You will be presented with a context and a filter condition. Output TRUE if the context satisfies the filter condition, and FALSE otherwise.

Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
---
CONTEXT:
{context}

INPUT FIELDS:
{input_fields_desc}

FILTER CONDITION: {filter_condition}

Let's think step by step in order to answer the question.

REASONING: """

COT_BOOL_IMAGE_USER_PROMPT = """You are a helpful assistant whose job is to analyze input image(s) and/or text in order to answer a TRUE / FALSE question.
You will be presented with the image(s) and a filter condition. You may also have some textual inputs. Output TRUE if the input(s) satisfy the filter condition, and FALSE otherwise.

Remember, your answer must be TRUE or FALSE. Finish your response with a newline character followed by ---
---
CONTEXT:
{context}

INPUT FIELDS:
{input_fields_desc}

FILTER CONDITION: {filter_condition}

"""

COT_QA_BASE_USER_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
You will be presented with a context and a set of output fields to generate. Your task is to generate a JSON object which fills in the output fields with the correct values.
You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the context.

{output_format_instruction} Finish your response with a newline character followed by ---
---
CONTEXT:
{context}

INPUT FIELDS:
{input_fields_desc}

OUTPUT FIELDS:
{output_fields_desc}

Let's think step by step in order to answer the question.

REASONING: """

COT_QA_IMAGE_BASE_USER_PROMPT = """You are a helpful assistant whose job is to analyze input image(s) and/or text in order to produce a JSON object.
You will be presented with the image(s) and a set of output fields to generate. You may also have some textual inputs. Your task is to generate a JSON object which fills in the output fields with the correct values.
You will be provided with a description of each output field. All of the fields in the output JSON object can be derived using information from the input(s).

{output_format_instruction} Finish your response with a newline character followed by ---
---
CONTEXT:
{context}

INPUT FIELDS:
{input_fields_desc}

OUTPUT FIELDS:
{output_fields_desc}

"""

COT_MOA_PROPOSER_BASE_USER_PROMPT = """You are a helpful assistant whose job is to produce an answer to a question.
You will be presented with a context and a set of output fields to generate. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
Be sure to cite information from the context as evidence of why your answers are correct. Do not hallucinate evidence.

You will be provided with a description of each input field and each output field.

{output_format_instruction} Finish your response with a newline character followed by ---
---
CONTEXT:
{context}

INPUT FIELDS:
{input_fields_desc}

OUTPUT FIELDS:
{output_fields_desc}

Let's think step by step in order to answer the question.

REASONING: """

COT_MOA_PROPOSER_IMAGE_BASE_USER_PROMPT = """You are a helpful assistant whose job is to analyze input image(s) and/or text in order to produce an answer to a question.
You will be presented with the image(s) and a set of output fields to generate. You may also have some textual inputs. Your task is to generate a paragraph or two which describes what you believe is the correct value for each output field.
Be sure to cite information from the input(s) as evidence of why your answers are correct. Do not hallucinate evidence.

You will be provided with a description of each input field and each output field.

{output_format_instruction} Finish your response with a newline character followed by ---
---
CONTEXT:
{context}

INPUT FIELDS:
{input_fields_desc}

OUTPUT FIELDS:
{output_fields_desc}

"""

COT_MOA_AGG_BASE_USER_PROMPT = """You are a helpful assistant whose job is to generate a JSON object.
You will be presented with one or more outputs produced by a set of models. Your task is to synthesize these responses into a single, high-quality JSON object which fills in the output fields with the correct values.
It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased or incorrect.

You will be provided with a description of each input field and each output field. All of the fields in the output JSON object can be derived using information from the model responses.

{output_format_instruction} Finish your response with a newline character followed by ---
---
{model_responses}

INPUT FIELDS:
{input_fields_desc}

OUTPUT FIELDS:
{output_fields_desc}

Let's think step by step in order to answer the question.

REASONING: """


### CODE SYNTHESIS PROMPTS ###
EXAMPLE_PROMPT = """Example{idx}:
Example Input
-------------
{example_inputs}

Example Output
--------------
{example_output}
"""

CODEGEN_PROMPT = """You are a helpful programming assistant and an expert {language} programmer. Implement the {language} function `{api}` that extracts `{output}` ({output_desc}) from given inputs:
{inputs_desc}
{examples_desc}
Notice that the evaluation will severely punish incorrect outputs. Thus, when the function is uncertain, it should return `None` to abstain instead of returning an incorrect guess.
{advice}
Return the implementation only."""

ADVICEGEN_PROMPT = """You are a helpful programming assistant and an expert {language} programmer. Your job is to provide programming ideas to help me write {language} programs.
For example, if I want to complete a task: "extract the salary number (in USD) from a given employee's document", you can provide me with {n} different ways to do it like:
Idea 1: Use regular expressions to extract the salary number: a number with a dollar sign in front of it. For example, $100,000.
Idea 2: Find the table entry with the salary number.
Idea 3: Use a pre-trained NLP model to extract the salary number.
#
Now, consider the following {language} programming task that extracts `{output}` ({output_desc}) from given inputs:
{examples_desc}
Please provide me with {n} different ideas to complete this task. Return the ideas only, following the format above.
"""
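As the module docstring above notes, these templates were filled in with str.format() using the bracketed parameter names, with literal braces in the few-shot examples escaped as {{ and }}. The sketch below is illustrative only: it abbreviates COT_BOOL_USER_PROMPT instead of importing the removed module, and the field values are invented.

```python
# Illustrative sketch of how the deleted module's templates were parameterized.
# The template is abbreviated here; in 0.5.4 it lived in palimpzest/prompts.py.
COT_BOOL_USER_PROMPT = (
    "CONTEXT:\n{context}\n\n"
    "INPUT FIELDS:\n{input_fields_desc}\n\n"
    "FILTER CONDITION: {filter_condition}\n\n"
    "Let's think step by step in order to answer the question.\n\nREASONING: "
)

prompt = COT_BOOL_USER_PROMPT.format(
    context='{"text": "The quick brown fox jumps over the lazy dog."}',
    input_fields_desc="- text: a short passage of text",
    filter_condition="the text mentions an animal",
)
print(prompt)
```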
palimpzest/query/operators/datasource.py
DELETED
@@ -1,202 +0,0 @@
from __future__ import annotations

import time
from abc import ABC, abstractmethod

from palimpzest.constants import (
    LOCAL_SCAN_TIME_PER_KB,
    MEMORY_SCAN_TIME_PER_KB,
    Cardinality,
)
from palimpzest.core.data.dataclasses import OperatorCostEstimates, RecordOpStats
from palimpzest.core.elements.records import DataRecord, DataRecordSet
from palimpzest.query.operators.physical import PhysicalOperator


class DataSourcePhysicalOp(PhysicalOperator, ABC):
    """
    Physical operators which implement DataSources require slightly more information
    in order to accurately compute naive cost estimates. Thus, we use a slightly
    modified abstract base class for these operators.
    """

    def __init__(self, dataset_id: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dataset_id = dataset_id

    def __str__(self):
        op = f"{self.op_name()}({self.dataset_id}) -> {self.output_schema}\n"
        op += f" ({', '.join(self.output_schema.field_names())[:30]})\n"
        return op

    def get_id_params(self):
        id_params = super().get_id_params()
        return {"dataset_id": self.dataset_id, **id_params}

    def get_op_params(self):
        op_params = super().get_op_params()
        return {"dataset_id": self.dataset_id, **op_params}

    def naive_cost_estimates(
        self,
        source_op_cost_estimates: OperatorCostEstimates,
        input_cardinality: Cardinality,
        input_record_size_in_bytes: int | float,
    ) -> OperatorCostEstimates:
        """
        This function returns a naive estimate of this operator's:
        - cardinality
        - time_per_record
        - cost_per_record
        - quality

        For the implemented operator. These will be used by the CostModel
        when PZ does not have sample execution data -- and it will be necessary
        in some cases even when sample execution data is present. (For example,
        the cardinality of each operator cannot be estimated based on sample
        execution data alone -- thus DataSourcePhysicalOps need to give
        at least ballpark correct estimates of this quantity).
        """
        raise NotImplementedError("Abstract method")

    @abstractmethod
    def get_datasource(self):
        raise NotImplementedError("Abstract method")


class MarshalAndScanDataOp(DataSourcePhysicalOp):
    def naive_cost_estimates(
        self,
        source_op_cost_estimates: OperatorCostEstimates,
        input_record_size_in_bytes: int | float,
        dataset_type: str,
    ) -> OperatorCostEstimates:
        # get inputs needed for naive cost estimation
        # TODO: we should rename cardinality --> "multiplier" or "selectivity" one-to-one / one-to-many

        # estimate time spent reading each record
        per_record_size_kb = input_record_size_in_bytes / 1024.0
        time_per_record = (
            LOCAL_SCAN_TIME_PER_KB * per_record_size_kb
            if dataset_type in ["dir", "file"]
            else MEMORY_SCAN_TIME_PER_KB * per_record_size_kb
        )

        # estimate output cardinality
        cardinality = source_op_cost_estimates.cardinality

        # for now, assume no cost per record for reading data
        return OperatorCostEstimates(
            cardinality=cardinality,
            time_per_record=time_per_record,
            cost_per_record=0,
            quality=1.0,
        )

    def __call__(self, candidate: DataRecord) -> DataRecordSet:
        """
        This function takes the candidate -- which is a DataRecord with a SourceRecord schema --
        and invokes its get_item_fn on the given idx to return the next DataRecord from the DataSource.
        """
        start_time = time.time()
        records = candidate.get_item_fn(candidate.idx)
        end_time = time.time()

        # if records is a DataRecord (instead of a list) wrap it in a list
        if isinstance(records, DataRecord):
            records = [records]

        # assert that every element of records is a DataRecord and has a source_id
        for dr in records:
            assert isinstance(dr, DataRecord), "Output from DataSource.get_item() must be a DataRecord or List[DataRecord]"

        # create RecordOpStats objects
        record_op_stats_lst = []
        for record in records:
            record_op_stats = RecordOpStats(
                record_id=record.id,
                record_parent_id=record.parent_id,
                record_source_id=record.source_id,
                record_state=record.to_dict(include_bytes=False),
                op_id=self.get_op_id(),
                logical_op_id=self.logical_op_id,
                op_name=self.op_name(),
                time_per_record=(end_time - start_time) / len(records),
                cost_per_record=0.0,
                op_details={k: str(v) for k, v in self.get_id_params().items()},
            )
            record_op_stats_lst.append(record_op_stats)

        # construct and return DataRecordSet object
        record_set = DataRecordSet(records, record_op_stats_lst)

        return record_set

    def get_datasource(self):
        return self.datadir.get_registered_dataset(self.dataset_id)

    def get_datasource_type(self):
        return self.datadir.get_registered_dataset_type(self.dataset_id)


class CacheScanDataOp(DataSourcePhysicalOp):
    def naive_cost_estimates(
        self,
        source_op_cost_estimates: OperatorCostEstimates,
        input_record_size_in_bytes: int | float,
    ):
        # get inputs needed for naive cost estimation
        # TODO: we should rename cardinality --> "multiplier" or "selectivity" one-to-one / one-to-many

        # estimate time spent reading each record
        per_record_size_kb = input_record_size_in_bytes / 1024.0
        time_per_record = LOCAL_SCAN_TIME_PER_KB * per_record_size_kb

        # estimate output cardinality
        cardinality = source_op_cost_estimates.cardinality

        # for now, assume no cost per record for reading from cache
        return OperatorCostEstimates(
            cardinality=cardinality,
            time_per_record=time_per_record,
            cost_per_record=0,
            quality=1.0,
        )

    def __call__(self, candidate: DataRecord) -> DataRecordSet:
        start_time = time.time()
        records = candidate.get_item_fn(candidate.idx)
        end_time = time.time()

        # if records is a DataRecord (instead of a list) wrap it in a list
        if isinstance(records, DataRecord):
            records = [records]

        # assert that every element of records is a DataRecord and has a source_id
        for dr in records:
            assert isinstance(dr, DataRecord), "Output from DataSource.get_item() must be a DataRecord or List[DataRecord]"

        # create RecordOpStats objects
        record_op_stats_lst = []
        for record in records:
            record_op_stats = RecordOpStats(
                record_id=record.id,
                record_parent_id=record.parent_id,
                record_source_id=record.source_id,
                record_state=record.to_dict(include_bytes=False),
                op_id=self.get_op_id(),
                logical_op_id=self.logical_op_id,
                op_name=self.op_name(),
                time_per_record=(end_time - start_time) / len(records),
                cost_per_record=0.0,
                op_details={k: str(v) for k, v in self.get_id_params().items()},
            )
            record_op_stats_lst.append(record_op_stats)

        # construct and return DataRecordSet object
        record_set = DataRecordSet(records, record_op_stats_lst)

        return record_set

    def get_datasource(self):
        return self.datadir.get_cached_result(self.dataset_id)
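The naive_cost_estimates methods above reduce to a size-proportional read-time estimate: the record size in KB multiplied by a per-KB constant chosen by whether the source is file-backed or in memory. A minimal standalone sketch of that calculation follows; the two constants are placeholders, not the real values defined in palimpzest.constants.

```python
# Minimal sketch of the naive scan-time estimate shown above; the constants
# below are placeholders, not the values from palimpzest.constants.
LOCAL_SCAN_TIME_PER_KB = 1e-3    # placeholder: seconds per KB read from disk
MEMORY_SCAN_TIME_PER_KB = 1e-5   # placeholder: seconds per KB read from memory

def naive_time_per_record(record_size_in_bytes: float, dataset_type: str) -> float:
    # size in KB times a per-KB rate, as in MarshalAndScanDataOp.naive_cost_estimates
    per_record_size_kb = record_size_in_bytes / 1024.0
    rate = LOCAL_SCAN_TIME_PER_KB if dataset_type in ["dir", "file"] else MEMORY_SCAN_TIME_PER_KB
    return rate * per_record_size_kb

# e.g. a 256 KB record scanned from a local file
print(naive_time_per_record(256 * 1024, "file"))  # 0.256 with the placeholder rate
```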
palimpzest-0.5.4.dist-info/RECORD
DELETED
@@ -1,83 +0,0 @@
cli/README.md,sha256=8_oUl_yl_BIwI8hgEwYapuTTweQh9RlJ2ADbDFSfuTg,5690
cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
cli/cli_main.py,sha256=D1rKqA89qUx16IR-k65a1KnCCQeynWrNgXItRHhXKWA,11333
palimpzest/__init__.py,sha256=L59dJSHD2m9-JaPzRPENfNsFQ34bvakaJEwzlTK8d2c,744
palimpzest/config.py,sha256=3rwtnfySDpFc4jD3__08QMd5Qb2aZNtYALT6GXtzkdo,3160
palimpzest/constants.py,sha256=NEVlHCKNfWslJXUMW90tAVZSO9oazrD0vJer2z39a6w,12633
palimpzest/policy.py,sha256=nqUU9CaciLazuJyQjC7r0HAn9KO2kuKkjIQ1qYyQRT4,10223
palimpzest/prompts.py,sha256=gov2ul-ggQ_8KsYXDsJpOTAhGHo9Zqw3TF4CXVZMsfI,16797
palimpzest/sets.py,sha256=a_1igrB9Gs_AM2rA131Swf2iy5nH2bbNjKtYM7afXRA,10969
palimpzest/core/__init__.py,sha256=Onhh-PvzmPIkEHvmfyV3YbIMcTuxs6wTeZDkgdATA1c,1598
palimpzest/core/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
palimpzest/core/data/dataclasses.py,sha256=45KKLbgz6p8RBm_vGCBFVVoQQhavb7iE0-gufehOj8Y,19210
palimpzest/core/data/datasources.py,sha256=R0rfaunjU-nTilbtUw6vlvPwIhPcimFsutcjzCJFs7c,12398
palimpzest/core/elements/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
palimpzest/core/elements/filters.py,sha256=zBBYRUnPfEYb85IfC04TJkS45GxgL5KfXASIHWwlwow,1554
palimpzest/core/elements/groupbysig.py,sha256=1qHuR2-fcW-E4rxPSieYGSXZYwvFaPwf1ld9VPWvWjw,2233
palimpzest/core/elements/records.py,sha256=Ei3d-eAjnkNT8Ne1u2N46Iv8MXdyg2n2MxZ2cgyMGiM,14143
palimpzest/core/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
palimpzest/core/lib/fields.py,sha256=kfL1bG1ic8dJFWX6oYbvih7c24hb0lN_sl74wkWVKeU,3963
palimpzest/core/lib/schemas.py,sha256=Nz5DBaCk6toZmHL1xNi1R8KiyPt4lnQ2SnrM9iqb9eU,17732
palimpzest/datamanager/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
palimpzest/datamanager/datamanager.py,sha256=H01K0DDC-5aqnLmTZoAYDQcNvPJLAld5jCccT6hRD7c,12080
palimpzest/query/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
palimpzest/query/execution/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
palimpzest/query/execution/execution_strategy.py,sha256=twU6lToTv58ln7GolVD5Ale6cLIf5-LgT9mzb1lSaVA,2590
palimpzest/query/execution/parallel_execution_strategy.py,sha256=iyZnTpAmTAwSiRgPGqvhpQObWAAyBYnxA9XKHUPpKOc,12037
palimpzest/query/execution/single_threaded_execution_strategy.py,sha256=-tHmb7EVjFb6TH7KNZ77JXjzSjibkLdBSXijscRvXcQ,14581
palimpzest/query/generators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
palimpzest/query/generators/generators.py,sha256=z0JOJq1TBy_QDcX0lx1_-Da9frx4wQivIjDLiAKWe3Q,32620
palimpzest/query/operators/__init__.py,sha256=HKKo91j-Jcu0u058R5FlnYmAIKrfwkn9JL0lbB1Gigg,3327
palimpzest/query/operators/aggregate.py,sha256=_vEbDGiMi_Kujjw-ORCKRHfpRvvG5_kwotaciTOJ714,9942
palimpzest/query/operators/code_synthesis_convert.py,sha256=NOlEn_dJygdUm4w3tuus3rRPgZuebCpBdKnOEc6DVeo,23512
palimpzest/query/operators/convert.py,sha256=BdrJjMHHmdzabxON2QVWqsSExChdUt5lqSCO5pCx6fA,19828
palimpzest/query/operators/datasource.py,sha256=k2jr2uI6OIZSuIhmaTXmxcAn_tcB4aGwAJSd8aht2hs,7767
palimpzest/query/operators/filter.py,sha256=93Jq46yKsMqucYRlIVymB3TNZnChhDhRyY9_j4uYTpo,10370
palimpzest/query/operators/limit.py,sha256=NnxplYen1bgax_0DbEy86IR2dRwnfjL3GkkcIr7CO2g,2101
palimpzest/query/operators/logical.py,sha256=izKiBKfhoI-CJdKQoqLrU0ahER-W4bU_MZiOVrTCZA4,14649
palimpzest/query/operators/mixture_of_agents_convert.py,sha256=gpX05W07wVGaX2ZIqA0Z9qUCJy9QdHhCwkRnXhSJ3pE,6561
palimpzest/query/operators/physical.py,sha256=q8N52EUeukrNcNAccB78x33lzU9b2-jiODB_B-9ABak,7947
palimpzest/query/operators/project.py,sha256=B0cuV307uf0ApV7ToCxnBu91eOI_FQweT7sr-9A9ftw,2082
palimpzest/query/operators/rag_convert.py,sha256=9bTmAczk9AnqehEq98NI2gxjBwgu0dRSukP6vpaIbiY,8365
palimpzest/query/operators/retrieve.py,sha256=4I0asx0eO90q5zfi1sudC0cDnyOLTv-nlV8Y4ZfBF3o,4989
palimpzest/query/operators/token_reduction_convert.py,sha256=yy9GYMPt-LQxPdwIgVyhCb9hi_8FRorGU8XqK_3jq9g,8513
palimpzest/query/optimizer/__init__.py,sha256=2af1v_3Dyyi5Y1la4cY2eyyKsjmRU6Lfoxeqf4U92Fo,2981
palimpzest/query/optimizer/cost_model.py,sha256=joUjt5oeA86dq_cRqp_hbPm2G-yNpSz5Jps000YbTbc,45518
palimpzest/query/optimizer/optimizer.py,sha256=aGwDy9W4Fv0uoD-i2KZ9erfe7Yv811PyyTWBdKM09s4,19722
palimpzest/query/optimizer/optimizer_strategy.py,sha256=71HbRWLbliAw2eV6PnqqkbPiLx7iqKFe2e23V3g-P_E,11874
palimpzest/query/optimizer/plan.py,sha256=VfHfHRE2vuNl-hT2Bbdwb_iVW_CVOVPBG7fLOk31VIU,5786
palimpzest/query/optimizer/primitives.py,sha256=ikaX8YcDM3IrxKt98OX-mYujRYQtdMlDgsFKyjchMMA,4061
palimpzest/query/optimizer/rules.py,sha256=vfy3oivZvFMxu0nLK0etkX9DYXfaxug0hRpAL67iE0E,39427
palimpzest/query/optimizer/tasks.py,sha256=ORyPpAbbVAUjkxh3WyDYw2I8Z6RfQLUsLGOh5987zTI,28058
palimpzest/query/processor/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
palimpzest/query/processor/config.py,sha256=YWxRW__n17gwtaorFXbxuRUmcpTowmdHL_mQFh96YnM,2751
palimpzest/query/processor/mab_sentinel_processor.py,sha256=6qDD0A6viKRknYN215DcQlwyt40NrXXrUtDRfNaeR58,45673
palimpzest/query/processor/nosentinel_processor.py,sha256=jQuvAbfXVASb561OZeDS545fr_-eHg8qODK8UErk3s0,27713
palimpzest/query/processor/query_processor.py,sha256=1bM6KhGsOPEPgsx33blmh3NVGheuwndRSk6TRqqSjYA,11558
palimpzest/query/processor/query_processor_factory.py,sha256=2GiOYnG4OGsaLAIipEYVK0ptejp7CKPXN3bfCDSjJbI,7739
palimpzest/query/processor/random_sampling_sentinel_processor.py,sha256=c6V1CV7FhhFjgleXR19bieNCX6NM0qvxYPla01rOrwY,29975
palimpzest/query/processor/streaming_processor.py,sha256=LCX4FW3trgk-LX89x-5yxKNZO-Qk8AXV3PJBvlPS92w,6933
palimpzest/schemabuilder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
palimpzest/schemabuilder/schema_builder.py,sha256=kGEv-Adba-FNziRrlG0zwx317IuD7rmzNl2GecvnbDw,8528
palimpzest/tools/README.md,sha256=56_6LPG80uc0CLVhTBP6I1wgIffNv9cyTr0TmVZqmrM,483
palimpzest/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
palimpzest/tools/allenpdf.py,sha256=fXMOmSDdSSLXDKAPYYJ8k4egtWEBf_Me9Lq9tM3iyoA,1690
palimpzest/tools/pdfparser.py,sha256=Nwb9mBr9t0CbpNa0ohdBLkNF4BMcwKTUEJvc8gSm-0A,12372
palimpzest/tools/skema_tools.py,sha256=HXUFpjMhbVxZwKKkATeK-FwtlTCawaCbeP-uHntI1Kg,669
palimpzest/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
palimpzest/utils/demo_helpers.py,sha256=BcNgtTz4O9iGriefy6f26BtJd_G5SQPzD3oQg_qLUdU,2522
palimpzest/utils/env_helpers.py,sha256=n81KzoJ459pRxo7QmJA7duazwWsfoMGTHc71D2LatFk,334
palimpzest/utils/generation_helpers.py,sha256=jveE9iQQtUQ94nuU6c1zuWoQMkwizr037S8si4n35jo,3230
palimpzest/utils/hash_helpers.py,sha256=FMvClHCVhJWKI-OAyeUI4kgT57VE9ROSgGw7tCQ4Y3c,283
palimpzest/utils/index_helpers.py,sha256=7webOjV2vYF7UJ_YsNdoX5OyR1zJ6lSLWO1mQSGWz0Q,123
palimpzest/utils/model_helpers.py,sha256=dZdMkZ6zOBqG3uBCkmzXG1yQAoaGL3wF6lNSgnvigEQ,2399
palimpzest/utils/progress.py,sha256=GYmPUBdG7xmqbqj1UiSNP-pWZKmRMLX797MBgrOPugM,7214
palimpzest/utils/sandbox.py,sha256=Ge96gmzqeOGlNkMCG9A95_PB8wRQbvTFua136of8FcA,6465
palimpzest/utils/token_reduction_helpers.py,sha256=Ob95PcqCsbGLiBdQ-4YQsWGWRppb2hvQyt0gi1fzL-Y,3855
palimpzest/utils/udfs.py,sha256=V1SGxXjkrdFmv5vE8vZhqFcAn8fvWUHBLQmV10VZ62A,2848
palimpzest-0.5.4.dist-info/LICENSE,sha256=5GUlHy9lr-Py9kvV38FF1m3yy3NqM18fefuE9wkWumo,1079
palimpzest-0.5.4.dist-info/METADATA,sha256=GSQpHcSooQvVqzvuL_IsROjdjDW4k5OxMfw2q6h-5F4,7336
palimpzest-0.5.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
palimpzest-0.5.4.dist-info/entry_points.txt,sha256=YyHoEmjhWcXfblAvTWeWj6cJHWPS20ZMGvf7-tVQDoQ,41
palimpzest-0.5.4.dist-info/top_level.txt,sha256=cVbFwaCyGJfBS9RpFQ7_g3wEgQ1BzLDCymEEDAYWC0g,15
palimpzest-0.5.4.dist-info/RECORD,,