palimpzest 0.5.4__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (71)
  1. palimpzest/__init__.py +7 -9
  2. palimpzest/constants.py +47 -7
  3. palimpzest/core/__init__.py +20 -26
  4. palimpzest/core/data/dataclasses.py +9 -2
  5. palimpzest/core/data/datareaders.py +497 -0
  6. palimpzest/core/elements/records.py +29 -37
  7. palimpzest/core/lib/fields.py +14 -12
  8. palimpzest/core/lib/schemas.py +80 -94
  9. palimpzest/policy.py +58 -0
  10. palimpzest/prompts/__init__.py +22 -0
  11. palimpzest/prompts/code_synthesis_prompts.py +28 -0
  12. palimpzest/prompts/convert_prompts.py +87 -0
  13. palimpzest/prompts/critique_and_refine_convert_prompts.py +216 -0
  14. palimpzest/prompts/filter_prompts.py +69 -0
  15. palimpzest/prompts/moa_aggregator_convert_prompts.py +57 -0
  16. palimpzest/prompts/moa_proposer_convert_prompts.py +79 -0
  17. palimpzest/prompts/prompt_factory.py +732 -0
  18. palimpzest/prompts/util_phrases.py +14 -0
  19. palimpzest/query/execution/execution_strategy.py +0 -3
  20. palimpzest/query/execution/parallel_execution_strategy.py +12 -25
  21. palimpzest/query/execution/single_threaded_execution_strategy.py +31 -45
  22. palimpzest/query/generators/generators.py +71 -347
  23. palimpzest/query/operators/__init__.py +5 -5
  24. palimpzest/query/operators/aggregate.py +10 -5
  25. palimpzest/query/operators/code_synthesis_convert.py +4 -48
  26. palimpzest/query/operators/convert.py +5 -2
  27. palimpzest/query/operators/critique_and_refine_convert.py +112 -0
  28. palimpzest/query/operators/filter.py +1 -1
  29. palimpzest/query/operators/limit.py +1 -1
  30. palimpzest/query/operators/logical.py +28 -27
  31. palimpzest/query/operators/mixture_of_agents_convert.py +4 -1
  32. palimpzest/query/operators/physical.py +32 -20
  33. palimpzest/query/operators/project.py +1 -1
  34. palimpzest/query/operators/rag_convert.py +6 -3
  35. palimpzest/query/operators/retrieve.py +13 -31
  36. palimpzest/query/operators/scan.py +150 -0
  37. palimpzest/query/optimizer/__init__.py +5 -1
  38. palimpzest/query/optimizer/cost_model.py +18 -34
  39. palimpzest/query/optimizer/optimizer.py +40 -25
  40. palimpzest/query/optimizer/optimizer_strategy.py +26 -0
  41. palimpzest/query/optimizer/plan.py +2 -2
  42. palimpzest/query/optimizer/rules.py +118 -27
  43. palimpzest/query/processor/config.py +12 -1
  44. palimpzest/query/processor/mab_sentinel_processor.py +125 -112
  45. palimpzest/query/processor/nosentinel_processor.py +46 -62
  46. palimpzest/query/processor/query_processor.py +10 -20
  47. palimpzest/query/processor/query_processor_factory.py +12 -5
  48. palimpzest/query/processor/random_sampling_sentinel_processor.py +112 -91
  49. palimpzest/query/processor/streaming_processor.py +11 -17
  50. palimpzest/sets.py +170 -94
  51. palimpzest/tools/pdfparser.py +5 -64
  52. palimpzest/utils/datareader_helpers.py +61 -0
  53. palimpzest/utils/field_helpers.py +69 -0
  54. palimpzest/utils/hash_helpers.py +3 -2
  55. palimpzest/utils/udfs.py +0 -28
  56. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/METADATA +49 -49
  57. palimpzest-0.6.1.dist-info/RECORD +87 -0
  58. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/top_level.txt +0 -1
  59. cli/README.md +0 -156
  60. cli/__init__.py +0 -0
  61. cli/cli_main.py +0 -390
  62. palimpzest/config.py +0 -89
  63. palimpzest/core/data/datasources.py +0 -369
  64. palimpzest/datamanager/__init__.py +0 -0
  65. palimpzest/datamanager/datamanager.py +0 -300
  66. palimpzest/prompts.py +0 -397
  67. palimpzest/query/operators/datasource.py +0 -202
  68. palimpzest-0.5.4.dist-info/RECORD +0 -83
  69. palimpzest-0.5.4.dist-info/entry_points.txt +0 -2
  70. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/LICENSE +0 -0
  71. {palimpzest-0.5.4.dist-info → palimpzest-0.6.1.dist-info}/WHEEL +0 -0
palimpzest/prompts/util_phrases.py
@@ -0,0 +1,14 @@
+ """This file contains utility phrases which are templated into many of our prompts."""
+
+ ### FORMATTING INSTRUCTIONS ###
+ ONE_TO_ONE_OUTPUT_FORMAT_INSTRUCTION = "Remember, your answer must be a valid JSON dictionary. The dictionary should only have the specified output fields."
+ ONE_TO_MANY_OUTPUT_FORMAT_INSTRUCTION = "Remember, your answer must be a valid JSON list of dictionaries. The list may contain one or more dictionaries, and each dictionary should only have the specified output fields."
+
+ ### REASONING INSTRUCTION FOR IMAGE PROMPTS ###
+ COT_REASONING_INSTRUCTION = """Let's think step-by-step in order to answer the question.
+
+ REASONING: """
+
+ COT_ANSWER_INSTRUCTION = """Let's think step-by-step in order to answer the question.
+
+ ANSWER: """
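
These constants are plain strings meant to be templated into larger prompts (presumably by the new prompt_factory.py added in this release). A minimal usage sketch: the base prompt wording and field names below are invented for illustration; only the imported constants come from the file above.

    from palimpzest.prompts.util_phrases import (
        COT_REASONING_INSTRUCTION,
        ONE_TO_ONE_OUTPUT_FORMAT_INSTRUCTION,
    )

    # Illustrative base prompt; not from the package.
    base_prompt = "Extract the following output fields from the input text: {fields}."

    prompt = "\n\n".join([
        base_prompt.format(fields="title, author"),
        ONE_TO_ONE_OUTPUT_FORMAT_INSTRUCTION,  # constrain the answer to a JSON dict
        COT_REASONING_INSTRUCTION,             # elicit step-by-step reasoning
    ])
    print(prompt)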
palimpzest/query/execution/execution_strategy.py
@@ -4,7 +4,6 @@ from enum import Enum
 
  from palimpzest.core.data.dataclasses import ExecutionStats, PlanStats
  from palimpzest.core.elements.records import DataRecord
- from palimpzest.datamanager.datamanager import DataDirectory
  from palimpzest.query.optimizer.plan import PhysicalPlan
 
 
@@ -23,12 +22,10 @@ class ExecutionStrategy(ABC):
      """
      def __init__(self,
          scan_start_idx: int = 0,
-         datadir: DataDirectory | None = None,
          max_workers: int | None = None,
          nocache: bool = True,
          verbose: bool = False):
          self.scan_start_idx = scan_start_idx
-         self.datadir = datadir
          self.nocache = nocache
          self.verbose = verbose
          self.max_workers = max_workers
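
For orientation, a sketch of constructing a strategy after this change. It assumes the subclasses shown in the hunks below accept the base-class arguments unchanged; their own __init__ signatures are not visible in this diff.

    from palimpzest.query.execution.parallel_execution_strategy import (
        PipelinedParallelExecutionStrategy,
    )

    # No DataDirectory handle anymore; the strategy only needs scan/worker/cache flags.
    strategy = PipelinedParallelExecutionStrategy(
        scan_start_idx=0,  # index into the datareader at which scanning begins
        max_workers=4,     # thread count for the ThreadPoolExecutor
        nocache=True,      # caching is stubbed out in 0.6.1 (see the hunks below)
        verbose=False,
    )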
palimpzest/query/execution/parallel_execution_strategy.py
@@ -4,13 +4,11 @@ from concurrent.futures import ThreadPoolExecutor, wait
 
  from palimpzest.constants import PARALLEL_EXECUTION_SLEEP_INTERVAL_SECS
  from palimpzest.core.data.dataclasses import OperatorStats, PlanStats
- from palimpzest.core.elements.records import DataRecord
- from palimpzest.core.lib.schemas import SourceRecord
  from palimpzest.query.execution.execution_strategy import ExecutionStrategy
  from palimpzest.query.operators.aggregate import AggregateOp
- from palimpzest.query.operators.datasource import DataSourcePhysicalOp
  from palimpzest.query.operators.limit import LimitScanOp
  from palimpzest.query.operators.physical import PhysicalOperator
+ from palimpzest.query.operators.scan import ScanPhysicalOp
  from palimpzest.query.optimizer.plan import PhysicalPlan
 
 
@@ -72,12 +70,11 @@ class PipelinedParallelExecutionStrategy(ExecutionStrategy):
          }
          op_id_to_op_idx = {op.get_op_id(): idx for idx, op in enumerate(plan.operators)}
 
-         # get handle to DataSource and pre-compute its op_id and size
+         # get handle to scan operator and pre-compute its op_id and size
          source_operator = plan.operators[0]
-         assert isinstance(source_operator, DataSourcePhysicalOp), "First operator in physical plan must be a DataSourcePhysicalOp"
+         assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
          source_op_id = source_operator.get_op_id()
-         datasource = source_operator.get_datasource()
-         datasource_len = len(datasource)
+         datareader_len = len(source_operator.datareader)
 
          # get limit of final limit operator (if one exists)
          final_limit = plan.operators[-1].limit if isinstance(plan.operators[-1], LimitScanOp) else None
@@ -87,13 +84,7 @@ class PipelinedParallelExecutionStrategy(ExecutionStrategy):
          current_scan_idx = self.scan_start_idx
          with ThreadPoolExecutor(max_workers=plan_workers) as executor:
              # create initial (set of) future(s) to read first source record;
-             # construct input DataRecord for DataSourcePhysicalOp
-             # NOTE: this DataRecord will be discarded and replaced by the scan_operator;
-             # it is simply a vessel to inform the scan_operator which record to fetch
-             candidate = DataRecord(schema=SourceRecord, source_id=current_scan_idx)
-             candidate.idx = current_scan_idx
-             candidate.get_item_fn = datasource.get_item
-             futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, source_operator, candidate))
+             futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, source_operator, current_scan_idx))
              op_id_to_futures_in_flight[source_op_id] += 1
              current_scan_idx += 1
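
The new palimpzest/query/operators/scan.py (a 150-line file whose body is not shown in this diff) is what makes the plain-integer calling convention above work. As a rough mental model only, a minimal operator matching that convention might look like the following; everything except the ScanPhysicalOp name, the datareader attribute, and the index-based call is invented.

    # Hypothetical sketch of the new scan calling convention; not palimpzest's code.
    class ScanPhysicalOpSketch:
        def __init__(self, datareader):
            # any sequence-like reader supporting __len__ and __getitem__
            self.datareader = datareader

        def __call__(self, scan_idx: int):
            # fetch the item by index directly; no throwaway DataRecord "vessel"
            # (the pre-0.6 pattern removed above) is required anymore
            return self.datareader[scan_idx]

    reader = ["doc-0", "doc-1", "doc-2"]  # stand-in for a DataReader
    scan_op = ScanPhysicalOpSketch(reader)
    print([scan_op(i) for i in range(len(scan_op.datareader))])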
@@ -131,7 +122,8 @@ class PipelinedParallelExecutionStrategy(ExecutionStrategy):
 
                  # add records (which are not filtered) to the cache, if allowed
                  if not self.nocache:
-                     self.datadir.append_cache(operator.target_cache_id, record)
+                     # self.datadir.append_cache(operator.target_cache_id, record)
+                     pass
 
                  # add records to processing queue if there is a next_operator; otherwise add to output_records
                  next_operator = op_id_to_next_operator[op_id]
@@ -145,14 +137,8 @@ class PipelinedParallelExecutionStrategy(ExecutionStrategy):
                      source_records_scanned += len(record_set)
 
                      # scan next record if we can still draw records from source
-                     if source_records_scanned < num_samples and current_scan_idx < datasource_len:
-                         # construct input DataRecord for DataSourcePhysicalOp
-                         # NOTE: this DataRecord will be discarded and replaced by the scan_operator;
-                         # it is simply a vessel to inform the scan_operator which record to fetch
-                         candidate = DataRecord(schema=SourceRecord, source_id=current_scan_idx)
-                         candidate.idx = current_scan_idx
-                         candidate.get_item_fn = datasource.get_item
-                         new_futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, source_operator, candidate))
+                     if source_records_scanned < num_samples and current_scan_idx < datareader_len:
+                         new_futures.append(executor.submit(PhysicalOperator.execute_op_wrapper, source_operator, current_scan_idx))
                          op_id_to_futures_in_flight[source_op_id] += 1
                          current_scan_idx += 1
 
@@ -217,8 +203,9 @@ class PipelinedParallelExecutionStrategy(ExecutionStrategy):
 
          # if caching was allowed, close the cache
          if not self.nocache:
-             for operator in plan.operators:
-                 self.datadir.close_cache(operator.target_cache_id)
+             for _ in plan.operators:
+                 # self.datadir.close_cache(operator.target_cache_id)
+                 pass
 
          # finalize plan stats
          total_plan_time = time.time() - plan_start_time
palimpzest/query/execution/single_threaded_execution_strategy.py
@@ -1,13 +1,11 @@
  import time
 
  from palimpzest.core.data.dataclasses import OperatorStats, PlanStats
- from palimpzest.core.elements.records import DataRecord
- from palimpzest.core.lib.schemas import SourceRecord
  from palimpzest.query.execution.execution_strategy import ExecutionStrategy
  from palimpzest.query.operators.aggregate import AggregateOp
- from palimpzest.query.operators.datasource import DataSourcePhysicalOp
  from palimpzest.query.operators.filter import FilterOp
  from palimpzest.query.operators.limit import LimitScanOp
+ from palimpzest.query.operators.scan import ScanPhysicalOp
  from palimpzest.query.optimizer.plan import PhysicalPlan
 
 
@@ -46,14 +44,13 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
          output_records = []
          current_scan_idx = self.scan_start_idx
 
-         # get handle to DataSource and pre-compute its size
+         # get handle to scan operator and pre-compute its size
          source_operator = plan.operators[0]
-         assert isinstance(source_operator, DataSourcePhysicalOp), "First operator in physical plan must be a DataSourcePhysicalOp"
-         datasource = source_operator.get_datasource()
-         datasource_len = len(datasource)
+         assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
+         datareader_len = len(source_operator.datareader)
 
          # initialize processing queues for each operation
-         processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, DataSourcePhysicalOp)}
+         processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, ScanPhysicalOp)}
 
          # execute the plan one operator at a time
          for op_idx, operator in enumerate(plan.operators):
@@ -64,19 +61,12 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
              # initialize output records and record_op_stats for this operator
              records, record_op_stats = [], []
 
-             # invoke datasource operator(s) until we run out of source records or hit the num_samples limit
-             if isinstance(operator, DataSourcePhysicalOp):
+             # invoke scan operator(s) until we run out of source records or hit the num_samples limit
+             if isinstance(operator, ScanPhysicalOp):
                  keep_scanning_source_records = True
                  while keep_scanning_source_records:
-                     # construct input DataRecord for DataSourcePhysicalOp
-                     # NOTE: this DataRecord will be discarded and replaced by the scan_operator;
-                     # it is simply a vessel to inform the scan_operator which record to fetch
-                     candidate = DataRecord(schema=SourceRecord, source_id=current_scan_idx)
-                     candidate.idx = current_scan_idx
-                     candidate.get_item_fn = datasource.get_item
-
-                     # run DataSourcePhysicalOp on record
-                     record_set = operator(candidate)
+                     # run ScanPhysicalOp on current scan index
+                     record_set = operator(current_scan_idx)
                      records.extend(record_set.data_records)
                      record_op_stats.extend(record_set.record_op_stats)
@@ -84,7 +74,7 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
                      current_scan_idx += 1
 
                      # update whether to keep scanning source records
-                     keep_scanning_source_records = current_scan_idx < datasource_len and len(records) < num_samples
+                     keep_scanning_source_records = current_scan_idx < datareader_len and len(records) < num_samples
 
              # aggregate operators accept all input records at once
              elif isinstance(operator, AggregateOp):
@@ -113,7 +103,8 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
              if not self.nocache:
                  for record in records:
                      if getattr(record, "passed_operator", True):
-                         self.datadir.append_cache(operator.target_cache_id, record)
+                         # self.datadir.append_cache(operator.target_cache_id, record)
+                         pass
 
              # update processing_queues or output_records
              for record in records:
@@ -130,8 +121,9 @@ class SequentialSingleThreadExecutionStrategy(ExecutionStrategy):
 
          # if caching was allowed, close the cache
          if not self.nocache:
-             for operator in plan.operators:
-                 self.datadir.close_cache(operator.target_cache_id)
+             for _ in plan.operators:
+                 # self.datadir.close_cache(operator.target_cache_id)
+                 pass
 
          # finalize plan stats
          total_plan_time = time.time() - plan_start_time
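
Condensed, the sequential scan loop above reduces to the following pattern. The operator and length arguments are stand-ins (a real ScanPhysicalOp returns a record set rather than a single value):

    # Distilled from the SequentialSingleThreadExecutionStrategy hunks above.
    def scan_source(operator, datareader_len, num_samples, scan_start_idx=0):
        records = []
        current_scan_idx = scan_start_idx
        keep_scanning_source_records = True
        while keep_scanning_source_records:
            records.append(operator(current_scan_idx))  # operator(int) per the new API
            current_scan_idx += 1
            keep_scanning_source_records = (
                current_scan_idx < datareader_len and len(records) < num_samples
            )
        return records

    # toy scan: stops after three samples from a ten-item reader
    print(scan_source(lambda i: f"record-{i}", datareader_len=10, num_samples=3))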
@@ -181,14 +173,13 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
          source_records_scanned = 0
          current_scan_idx = self.scan_start_idx
 
-         # get handle to DataSource and pre-compute its size
+         # get handle to scan operator and pre-compute its size
          source_operator = plan.operators[0]
-         assert isinstance(source_operator, DataSourcePhysicalOp), "First operator in physical plan must be a DataSourcePhysicalOp"
-         datasource = source_operator.get_datasource()
-         datasource_len = len(datasource)
+         assert isinstance(source_operator, ScanPhysicalOp), "First operator in physical plan must be a ScanPhysicalOp"
+         datareader_len = len(source_operator.datareader)
 
          # initialize processing queues for each operation
-         processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, DataSourcePhysicalOp)}
+         processing_queues = {op.get_op_id(): [] for op in plan.operators if not isinstance(op, ScanPhysicalOp)}
 
          # execute the plan until either:
          # 1. all records have been processed, or
@@ -204,18 +195,11 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
              # create empty lists for records and execution stats generated by executing this operator on its next input(s)
              records, record_op_stats = [], []
 
-             # invoke datasource operator(s) until we run out of source records or hit the num_samples limit
-             if isinstance(operator, DataSourcePhysicalOp):
+             # invoke scan operator(s) until we run out of source records or hit the num_samples limit
+             if isinstance(operator, ScanPhysicalOp):
                  if keep_scanning_source_records:
-                     # construct input DataRecord for DataSourcePhysicalOp
-                     # NOTE: this DataRecord will be discarded and replaced by the scan_operator;
-                     # it is simply a vessel to inform the scan_operator which record to fetch
-                     candidate = DataRecord(schema=SourceRecord, source_id=current_scan_idx)
-                     candidate.idx = current_scan_idx
-                     candidate.get_item_fn = datasource.get_item
-
-                     # run DataSourcePhysicalOp on record
-                     record_set = operator(candidate)
+                     # run ScanPhysicalOp on current scan index
+                     record_set = operator(current_scan_idx)
                      records = record_set.data_records
                      record_op_stats = record_set.record_op_stats
@@ -230,8 +214,8 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
              elif isinstance(operator, AggregateOp):
                  upstream_ops_are_finished = True
                  for upstream_op_idx in range(op_idx):
-                     # datasources do not have processing queues
-                     if isinstance(plan.operators[upstream_op_idx], DataSourcePhysicalOp):
+                     # scan operators do not have processing queues
+                     if isinstance(plan.operators[upstream_op_idx], ScanPhysicalOp):
                          continue
 
                      # check upstream ops which do have a processing queue
@@ -266,7 +250,8 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
              if not self.nocache:
                  for record in records:
                      if getattr(record, "passed_operator", True):
-                         self.datadir.append_cache(operator.target_cache_id, record)
+                         # self.datadir.append_cache(operator.target_cache_id, record)
+                         pass
 
              # update processing_queues or output_records
              for record in records:
@@ -279,7 +264,7 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
 
              # update finished_executing based on whether all records have been processed
              still_processing = any([len(queue) > 0 for queue in processing_queues.values()])
-             keep_scanning_source_records = current_scan_idx < datasource_len and source_records_scanned < num_samples
+             keep_scanning_source_records = current_scan_idx < datareader_len and source_records_scanned < num_samples
              finished_executing = not keep_scanning_source_records and not still_processing
 
              # update finished_executing based on limit
@@ -288,8 +273,9 @@ class PipelinedSingleThreadExecutionStrategy(ExecutionStrategy):
 
          # if caching was allowed, close the cache
          if not self.nocache:
-             for operator in plan.operators:
-                 self.datadir.close_cache(operator.target_cache_id)
+             for _ in plan.operators:
+                 # self.datadir.close_cache(operator.target_cache_id)
+                 pass
 
          # finalize plan stats
          total_plan_time = time.time() - plan_start_time