PyPI - palimpzest - Versions diffs - 0.7.1__tar.gz → 0.7.3__tar.gz - Mend

palimpzest 0.7.1.tar.gz → 0.7.3.tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (101) hide show

{palimpzest-0.7.1/src/palimpzest.egg-info → palimpzest-0.7.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 0.7.1
+Version: 0.7.3
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org
@@ -15,45 +15,25 @@ Classifier: Programming Language :: Python :: 3.8
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: charset-normalizer>=3.3.2
 Requires-Dist: chromadb>=0.6.3
-Requires-Dist: click>=8.1.7
-Requires-Dist: click-aliases>=1.0.4
-Requires-Dist: colorama>=0.4.6
 Requires-Dist: fastapi~=0.115.0
-Requires-Dist: fuzzywuzzy>=0.18.0
-Requires-Dist: google-generativeai>=0.8.0
-Requires-Dist: gradio>=4.20.1
-Requires-Dist: grobid-client-python==0.0.5
-Requires-Dist: ipython>=8.26.0
-Requires-Dist: opencv-python-headless<4.9.0,>=4.8.0
-Requires-Dist: jupyter
-Requires-Dist: layoutparser>=0.3.4
-Requires-Dist: lxml-html-clean>=0.1.1
+Requires-Dist: gradio>=5.26.0
 Requires-Dist: mkdocs>=1.6.1
 Requires-Dist: mkdocs-material>=9.6.3
 Requires-Dist: mkdocs-material[imaging]
 Requires-Dist: mkdocstrings-python>=1.15.0
-Requires-Dist: modal>=0.62.198
-Requires-Dist: ncls==0.0.68
-Requires-Dist: necessary>=0.3.2
 Requires-Dist: numpy>=1.23.2
 Requires-Dist: openai>=1.0
-Requires-Dist: openpyxl==3.1.2
 Requires-Dist: pandas>=2.1.1
-Requires-Dist: papermage>=0.16.0
-Requires-Dist: pdf2image
 Requires-Dist: pytest>=8.2.2
-Requires-Dist: python-Levenshtein
-Requires-Dist: pdfplumber==0.7.4
-Requires-Dist: pillow>=10.2.0
+Requires-Dist: pillow
 Requires-Dist: prettytable>=3.9.0
+Requires-Dist: psutil>=7.0.0
 Requires-Dist: PyLD>=2.0.4
 Requires-Dist: pyarrow<15.0.0,>=13.0.0; python_version < "3.12"
 Requires-Dist: pyarrow<19.0.0,>=15.0.0; python_version >= "3.12"
 Requires-Dist: pypdf>=5.1.0
 Requires-Dist: pytest-mock>=3.14.0
-Requires-Dist: python-Levenshtein>=0.25.1
 Requires-Dist: pyyaml>=6.0.1
 Requires-Dist: ragatouille>=0.0.9
 Requires-Dist: requests>=2.25
@@ -64,7 +44,6 @@ Requires-Dist: together>=1.3.1
 Requires-Dist: tqdm~=4.66.1
 Requires-Dist: transformers<4.50.0,>=4.41.3
 Requires-Dist: rich[jupyter]>=13.9.2
-Requires-Dist: voyager>=2.0.9
 Dynamic: license-file
 ![pz-banner](https://palimpzest-workloads.s3.us-east-1.amazonaws.com/palimpzest-cropped.png)

{palimpzest-0.7.1 → palimpzest-0.7.3}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "palimpzest"
-version = "0.7.1"
+version = "0.7.3"
 description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
 readme = "README.md"
 requires-python = ">=3.8"
@@ -9,45 +9,25 @@ authors = [
     {name="MIT DSG Semantic Management Lab", email="michjc@csail.mit.edu"},
 ]
 dependencies = [
-    "charset-normalizer>=3.3.2",
     "chromadb>=0.6.3",
-    "click>=8.1.7",
-    "click-aliases>=1.0.4",
-    "colorama>=0.4.6",
     "fastapi~=0.115.0",
-    "fuzzywuzzy>=0.18.0",
-    "google-generativeai>=0.8.0",
-    "gradio>=4.20.1",
-    "grobid-client-python==0.0.5",
-    "ipython>=8.26.0",
-    "opencv-python-headless>=4.8.0,<4.9.0",
-    "jupyter",
-    "layoutparser>=0.3.4",
-    "lxml-html-clean>=0.1.1",
+    "gradio>=5.26.0",
     "mkdocs>=1.6.1",
     "mkdocs-material>=9.6.3",
     "mkdocs-material[imaging]",
     "mkdocstrings-python>=1.15.0",
-    "modal>=0.62.198",
-    "ncls==0.0.68",
-    "necessary>=0.3.2",
     "numpy>=1.23.2",
     "openai>=1.0",
-    "openpyxl==3.1.2",
     "pandas>=2.1.1",
-    "papermage>=0.16.0",
-    "pdf2image",
     "pytest>=8.2.2",
-    "python-Levenshtein",
-    "pdfplumber==0.7.4",
-    "pillow>=10.2.0",
+    "pillow",
     "prettytable>=3.9.0",
+    "psutil>=7.0.0",
     "PyLD>=2.0.4",
     "pyarrow>=13.0.0,<15.0.0; python_version<'3.12'",
     "pyarrow>=15.0.0,<19.0.0; python_version>='3.12'",
     "pypdf>=5.1.0",
     "pytest-mock>=3.14.0",
-    "python-Levenshtein>=0.25.1",
     "pyyaml>=6.0.1",
     "ragatouille>=0.0.9",
     "requests>=2.25",
@@ -58,7 +38,6 @@ dependencies = [
     "tqdm~=4.66.1",
     "transformers>=4.41.3,<4.50.0",
     "rich[jupyter]>=13.9.2",
-    "voyager>=2.0.9",
 ]
 classifiers=[
     "Development Status :: 4 - Beta",  # Change as appropriate

{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/data/datareaders.py RENAMED Viewed

@@ -1,15 +1,12 @@
 from __future__ import annotations
 import base64
-import json
 import os
 from abc import ABC, abstractmethod
 from io import BytesIO
-import modal
 import pandas as pd
 from bs4 import BeautifulSoup
-from papermage import Document
 from palimpzest import constants
 from palimpzest.core.lib.schemas import (
@@ -390,22 +387,8 @@ class PDFFileDirectoryReader(DirectoryReader):
         with open(filepath, "rb") as f:
             pdf_bytes = f.read()
-        if self.pdfprocessor == "modal":
-            print("handling PDF processing remotely")
-            remote_func = modal.Function.lookup("palimpzest.tools", "processPapermagePdf")
-        else:
-            remote_func = None
         # generate text_content from PDF
-        if remote_func is not None:
-            doc_json_str = remote_func.remote([pdf_bytes])
-            docdict = json.loads(doc_json_str[0])
-            doc = Document.from_json(docdict)
-            text_content = ""
-            for p in doc.pages:
-                text_content += p.text
-        else:
-            text_content = get_text_from_pdf(pdf_filename, pdf_bytes, pdfprocessor=self.pdfprocessor, file_cache_dir=self.file_cache_dir)
+        text_content = get_text_from_pdf(pdf_filename, pdf_bytes, pdfprocessor=self.pdfprocessor, file_cache_dir=self.file_cache_dir)
         # construct and return item
         return {"filename": pdf_filename, "contents": pdf_bytes, "text_contents": text_content}

{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/rag_convert.py RENAMED Viewed

@@ -64,8 +64,7 @@ class RAGConvert(LLMConvert):
             + MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
         )
-        # set refined estimate of cost per record and, for now,
-        # assume quality multiplier is proportional to sqrt(sqrt(token_budget))
+        # set refined estimate of cost per record
         naive_op_cost_estimates.cost_per_record = model_conversion_usd_per_record
         naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
         naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record

{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/split_convert.py RENAMED Viewed

@@ -61,8 +61,7 @@ class SplitConvert(LLMConvert):
             + MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
         )
-        # set refined estimate of cost per record and, for now,
-        # assume quality multiplier is proportional to sqrt(sqrt(token_budget))
+        # set refined estimate of cost per record
         naive_op_cost_estimates.cost_per_record = model_conversion_usd_per_record
         naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
         naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record

{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/__init__.py RENAMED Viewed

@@ -46,9 +46,6 @@ from palimpzest.query.optimizer.rules import (
 from palimpzest.query.optimizer.rules import (
     SplitConvertRule as _SplitConvertRule,
 )
-from palimpzest.query.optimizer.rules import (
-    TokenReducedConvertBondedRule as _TokenReducedConvertBondedRule,
-)
 from palimpzest.query.optimizer.rules import (
     TransformationRule as _TransformationRule,
 )
@@ -70,7 +67,6 @@ ALL_RULES = [
     _RetrieveRule,
     _Rule,
     _SplitConvertRule,
-    _TokenReducedConvertBondedRule,
     _TransformationRule,
 ]

{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/cost_model.py RENAMED Viewed

@@ -1,7 +1,6 @@
 from __future__ import annotations
 import logging
-import math
 # NOTE: the answer.mode() call(s) inside of _est_quality() throw a UserWarning when there are multiple
 #       answers to a convert with the same mode. This is because pandas tries to sort the answers
@@ -24,7 +23,6 @@ from palimpzest.query.operators.limit import LimitScanOp
 from palimpzest.query.operators.physical import PhysicalOperator
 from palimpzest.query.operators.rag_convert import RAGConvert
 from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp, ScanPhysicalOp
-from palimpzest.query.operators.token_reduction_convert import TokenReducedConvertBonded
 from palimpzest.utils.model_helpers import get_champion_model_name, get_models
 warnings.simplefilter(action='ignore', category=UserWarning)
@@ -574,16 +572,6 @@ class CostModel(BaseCostModel):
                     op_estimates.cost_per_record = 1e-4
                     op_estimates.quality = op_estimates.quality * (GPT_4o_MODEL_CARD["code"] / 100.0)
-                # token reduction adjustment
-                if isinstance(operator, TokenReducedConvertBonded):
-                    total_input_tokens = operator.token_budget * sample_op_estimates[op_id][model_name]["total_input_tokens"]
-                    total_output_tokens = sample_op_estimates[op_id][model_name]["total_output_tokens"]
-                    op_estimates.cost_per_record = (
-                        MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens
-                        + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens
-                    )
-                    op_estimates.quality = op_estimates.quality * math.sqrt(math.sqrt(operator.token_budget))
                 # rag convert adjustment
                 if isinstance(operator, RAGConvert):
                     total_input_tokens = operator.num_chunks_per_field * operator.chunk_size

{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/optimizer.py RENAMED Viewed

@@ -34,7 +34,6 @@ from palimpzest.query.optimizer.rules import (
     MixtureOfAgentsConvertRule,
     RAGConvertRule,
     SplitConvertRule,
-    TokenReducedConvertBondedRule,
 )
 from palimpzest.query.optimizer.tasks import (
     ApplyRule,
@@ -90,7 +89,6 @@ class Optimizer:
         verbose: bool = False,
         allow_bonded_query: bool = True,
         allow_code_synth: bool = False,
-        allow_token_reduction: bool = False,
         allow_rag_reduction: bool = False,
         allow_mixtures: bool = True,
         allow_critic: bool = False,
@@ -134,7 +132,6 @@ class Optimizer:
         if optimizer_strategy == OptimizationStrategyType.NONE:
             self.allow_bonded_query = True
             self.allow_code_synth = False
-            self.allow_token_reduction = False
             self.allow_rag_reduction = False
             self.allow_mixtures = False
             self.allow_critic = False
@@ -147,7 +144,6 @@ class Optimizer:
         self.available_models = available_models
         self.allow_bonded_query = allow_bonded_query
         self.allow_code_synth = allow_code_synth
-        self.allow_token_reduction = allow_token_reduction
         self.allow_rag_reduction = allow_rag_reduction
         self.allow_mixtures = allow_mixtures
         self.allow_critic = allow_critic
@@ -160,7 +156,7 @@ class Optimizer:
             self.implementation_rules = [
                 rule
                 for rule in self.implementation_rules
-                if rule not in [LLMConvertBondedRule, TokenReducedConvertBondedRule]
+                if rule not in [LLMConvertBondedRule]
             ]
         if not self.allow_code_synth:
@@ -168,11 +164,6 @@ class Optimizer:
                 rule for rule in self.implementation_rules if not issubclass(rule, CodeSynthesisConvertRule)
             ]
-        if not self.allow_token_reduction:
-            self.implementation_rules = [
-                rule for rule in self.implementation_rules if not issubclass(rule, TokenReducedConvertBondedRule)
-            ]
         if not self.allow_rag_reduction:
             self.implementation_rules = [
                 rule for rule in self.implementation_rules if not issubclass(rule, RAGConvertRule)
@@ -218,7 +209,6 @@ class Optimizer:
             available_models=self.available_models,
             allow_bonded_query=self.allow_bonded_query,
             allow_code_synth=self.allow_code_synth,
-            allow_token_reduction=self.allow_token_reduction,
             allow_rag_reduction=self.allow_rag_reduction,
             allow_mixtures=self.allow_mixtures,
             allow_critic=self.allow_critic,

{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/rules.py RENAMED Viewed

@@ -28,7 +28,6 @@ from palimpzest.query.operators.rag_convert import RAGConvert
 from palimpzest.query.operators.retrieve import RetrieveOp
 from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp
 from palimpzest.query.operators.split_convert import SplitConvert
-from palimpzest.query.operators.token_reduction_convert import TokenReducedConvertBonded
 from palimpzest.query.optimizer.primitives import Expression, Group, LogicalExpression, PhysicalExpression
 from palimpzest.utils.model_helpers import get_models, get_vision_models
@@ -352,81 +351,6 @@ class LLMConvertBondedRule(ImplementationRule):
         return deduped_physical_expressions
-class TokenReducedConvertBondedRule(ImplementationRule):
-    """
-    Substitute a logical expression for a ConvertScan with a bonded token reduced physical implementation.
-    """
-    token_budgets = [0.1, 0.5, 0.9]
-    @classmethod
-    def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
-        logical_op = logical_expression.operator
-        is_image_conversion = any(
-            [
-                field.is_image_field
-                for field_name, field in logical_expression.input_fields.items()
-                if field_name.split(".")[-1] in logical_expression.depends_on_field_names
-            ]
-        )
-        is_match = isinstance(logical_op, ConvertScan) and not is_image_conversion and logical_op.udf is None
-        logger.debug(f"TokenReducedConvertBondedRule matches_pattern: {is_match} for {logical_expression}")
-        return is_match
-    @classmethod
-    def substitute(cls, logical_expression: LogicalExpression, **physical_op_params) -> set[PhysicalExpression]:
-        logger.debug(f"Substituting TokenReducedConvertBondedRule for {logical_expression}")
-        logical_op = logical_expression.operator
-        # get initial set of parameters for physical op
-        op_kwargs = logical_op.get_logical_op_params()
-        op_kwargs.update(
-            {
-                "verbose": physical_op_params["verbose"],
-                "logical_op_id": logical_op.get_logical_op_id(),
-                "logical_op_name": logical_op.logical_op_name(),
-            }
-        )
-        # NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
-        #       thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
-        #
-        # identify models which can be used strictly for text or strictly for images
-        vision_models = set(get_vision_models())
-        text_models = set(get_models())
-        pure_vision_models = {model for model in vision_models if model not in text_models}
-        physical_expressions = []
-        for model in physical_op_params["available_models"]:
-            for token_budget in cls.token_budgets:
-                # skip this model if this is a pure image model
-                if model in pure_vision_models:
-                    continue
-                # construct multi-expression
-                op = TokenReducedConvertBonded(
-                    model=model,
-                    prompt_strategy=PromptStrategy.COT_QA,
-                    token_budget=token_budget,
-                    **op_kwargs,
-                )
-                expression = PhysicalExpression(
-                    operator=op,
-                    input_group_ids=logical_expression.input_group_ids,
-                    input_fields=logical_expression.input_fields,
-                    depends_on_field_names=logical_expression.depends_on_field_names,
-                    generated_fields=logical_expression.generated_fields,
-                    group_id=logical_expression.group_id,
-                )
-                physical_expressions.append(expression)
-        logger.debug(f"Done substituting TokenReducedConvertBondedRule for {logical_expression}")
-        deduped_physical_expressions = set(physical_expressions)
-        return deduped_physical_expressions
 class CodeSynthesisConvertRule(ImplementationRule):
     """
     Base rule for code synthesis convert operators; the physical convert class

{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/config.py RENAMED Viewed

@@ -31,7 +31,6 @@ class QueryProcessorConfig:
     allow_bonded_query: bool = field(default=True)
     allow_model_selection: bool = field(default=True)
     allow_code_synth: bool = field(default=False)
-    allow_token_reduction: bool = field(default=False)
     allow_rag_reduction: bool = field(default=True)
     allow_mixtures: bool = field(default=True)
     allow_critic: bool = field(default=True)
@@ -59,7 +58,6 @@ class QueryProcessorConfig:
             "allow_bonded_query": self.allow_bonded_query,
             "allow_model_selection": self.allow_model_selection,
             "allow_code_synth": self.allow_code_synth,
-            "allow_token_reduction": self.allow_token_reduction,
             "allow_rag_reduction": self.allow_rag_reduction,
             "allow_mixtures": self.allow_mixtures,
             "allow_critic": self.allow_critic,

{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/demo_helpers.py RENAMED Viewed

@@ -47,8 +47,6 @@ def create_plan_str(flatten_ops):
                     else str(right.filter.filter_fn)
                 )
                 plan_str += f'\n    Filter: "{filter_str}"'
-            if hasattr(right, "token_budget"):
-                plan_str += f"\n    Token budget: {right.token_budget}"
         plan_str += "\n"
         plan_str += (
             f"    ({','.join(in_schema.field_names())[:15]}...) -> ({','.join(out_schema.field_names())[:15]}...)"

{palimpzest-0.7.1 → palimpzest-0.7.3/src/palimpzest.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: palimpzest
-Version: 0.7.1
+Version: 0.7.3
 Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
 Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
 Project-URL: homepage, https://palimpzest.org
@@ -15,45 +15,25 @@ Classifier: Programming Language :: Python :: 3.8
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: charset-normalizer>=3.3.2
 Requires-Dist: chromadb>=0.6.3
-Requires-Dist: click>=8.1.7
-Requires-Dist: click-aliases>=1.0.4
-Requires-Dist: colorama>=0.4.6
 Requires-Dist: fastapi~=0.115.0
-Requires-Dist: fuzzywuzzy>=0.18.0
-Requires-Dist: google-generativeai>=0.8.0
-Requires-Dist: gradio>=4.20.1
-Requires-Dist: grobid-client-python==0.0.5
-Requires-Dist: ipython>=8.26.0
-Requires-Dist: opencv-python-headless<4.9.0,>=4.8.0
-Requires-Dist: jupyter
-Requires-Dist: layoutparser>=0.3.4
-Requires-Dist: lxml-html-clean>=0.1.1
+Requires-Dist: gradio>=5.26.0
 Requires-Dist: mkdocs>=1.6.1
 Requires-Dist: mkdocs-material>=9.6.3
 Requires-Dist: mkdocs-material[imaging]
 Requires-Dist: mkdocstrings-python>=1.15.0
-Requires-Dist: modal>=0.62.198
-Requires-Dist: ncls==0.0.68
-Requires-Dist: necessary>=0.3.2
 Requires-Dist: numpy>=1.23.2
 Requires-Dist: openai>=1.0
-Requires-Dist: openpyxl==3.1.2
 Requires-Dist: pandas>=2.1.1
-Requires-Dist: papermage>=0.16.0
-Requires-Dist: pdf2image
 Requires-Dist: pytest>=8.2.2
-Requires-Dist: python-Levenshtein
-Requires-Dist: pdfplumber==0.7.4
-Requires-Dist: pillow>=10.2.0
+Requires-Dist: pillow
 Requires-Dist: prettytable>=3.9.0
+Requires-Dist: psutil>=7.0.0
 Requires-Dist: PyLD>=2.0.4
 Requires-Dist: pyarrow<15.0.0,>=13.0.0; python_version < "3.12"
 Requires-Dist: pyarrow<19.0.0,>=15.0.0; python_version >= "3.12"
 Requires-Dist: pypdf>=5.1.0
 Requires-Dist: pytest-mock>=3.14.0
-Requires-Dist: python-Levenshtein>=0.25.1
 Requires-Dist: pyyaml>=6.0.1
 Requires-Dist: ragatouille>=0.0.9
 Requires-Dist: requests>=2.25
@@ -64,7 +44,6 @@ Requires-Dist: together>=1.3.1
 Requires-Dist: tqdm~=4.66.1
 Requires-Dist: transformers<4.50.0,>=4.41.3
 Requires-Dist: rich[jupyter]>=13.9.2
-Requires-Dist: voyager>=2.0.9
 Dynamic: license-file
 ![pz-banner](https://palimpzest-workloads.s3.us-east-1.amazonaws.com/palimpzest-cropped.png)

{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest.egg-info/SOURCES.txt RENAMED Viewed

@@ -60,7 +60,6 @@ src/palimpzest/query/operators/rag_convert.py
 src/palimpzest/query/operators/retrieve.py
 src/palimpzest/query/operators/scan.py
 src/palimpzest/query/operators/split_convert.py
-src/palimpzest/query/operators/token_reduction_convert.py
 src/palimpzest/query/optimizer/__init__.py
 src/palimpzest/query/optimizer/cost_model.py
 src/palimpzest/query/optimizer/optimizer.py
@@ -95,5 +94,4 @@ src/palimpzest/utils/hash_helpers.py
 src/palimpzest/utils/model_helpers.py
 src/palimpzest/utils/progress.py
 src/palimpzest/utils/sandbox.py
-src/palimpzest/utils/token_reduction_helpers.py
 src/palimpzest/utils/udfs.py

{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest.egg-info/requires.txt RENAMED Viewed

@@ -1,40 +1,20 @@
-charset-normalizer>=3.3.2
 chromadb>=0.6.3
-click>=8.1.7
-click-aliases>=1.0.4
-colorama>=0.4.6
 fastapi~=0.115.0
-fuzzywuzzy>=0.18.0
-google-generativeai>=0.8.0
-gradio>=4.20.1
-grobid-client-python==0.0.5
-ipython>=8.26.0
-opencv-python-headless<4.9.0,>=4.8.0
-jupyter
-layoutparser>=0.3.4
-lxml-html-clean>=0.1.1
+gradio>=5.26.0
 mkdocs>=1.6.1
 mkdocs-material>=9.6.3
 mkdocs-material[imaging]
 mkdocstrings-python>=1.15.0
-modal>=0.62.198
-ncls==0.0.68
-necessary>=0.3.2
 numpy>=1.23.2
 openai>=1.0
-openpyxl==3.1.2
 pandas>=2.1.1
-papermage>=0.16.0
-pdf2image
 pytest>=8.2.2
-python-Levenshtein
-pdfplumber==0.7.4
-pillow>=10.2.0
+pillow
 prettytable>=3.9.0
+psutil>=7.0.0
 PyLD>=2.0.4
 pypdf>=5.1.0
 pytest-mock>=3.14.0
-python-Levenshtein>=0.25.1
 pyyaml>=6.0.1
 ragatouille>=0.0.9
 requests>=2.25
@@ -45,7 +25,6 @@ together>=1.3.1
 tqdm~=4.66.1
 transformers<4.50.0,>=4.41.3
 rich[jupyter]>=13.9.2
-voyager>=2.0.9
 [:python_version < "3.12"]
 pyarrow<15.0.0,>=13.0.0

palimpzest-0.7.1/src/palimpzest/query/operators/token_reduction_convert.py DELETED Viewed

@@ -1,169 +0,0 @@
-from __future__ import annotations
-import math
-from typing import Any
-from palimpzest.constants import (
-    MODEL_CARDS,
-    NAIVE_EST_NUM_INPUT_TOKENS,
-    NAIVE_EST_NUM_OUTPUT_TOKENS,
-)
-from palimpzest.core.data.dataclasses import OperatorCostEstimates
-from palimpzest.query.operators.convert import LLMConvertBonded
-from palimpzest.utils.token_reduction_helpers import best_substring_match, find_best_range
-# NOTE: this convert operation will not work with the new generation abstraction, and it needs to be worked on.
-#       There are two minor issues with the operator as it exists:
-#
-#       1) The token reduction operation operated over the entire JSON string of the input DataRecord
-#          - while this works in practice, it makes it difficult to use this operator with a generation framework
-#            where each field may be placed in a specific place in the format string for a prompt
-#          - we need to either (A) rewrite the reduction to take place on a field-by-field basis (or at least
-#            make it possible to recover each field after a global reduction) or (B) add custom logic within
-#            the Generator class(es) to handle this operator [I much prefer (A) over (B)]
-#
-#       2) The heatmap update logic does not translate well to the distributed setting, where this operator may
-#          be copied and executed many times in parallel
-#          - each copy of the operator will have its own heatmap and require MAX_HEATMAP_UPDATES just to enter the
-#            phase where token reduction takes place
-#          - this means that if we have 20-way parallelism and a MAX_HEATMAP_UPDATES = 5, it can take 100 inputs
-#            before token reduction ever takes place
-#          - this also creates difficulties in properly performing cost-estimation for this operator; e.g. if we use
-#            n <= MAX_HEATMAP_UPDATES samples to cost this operator, then we will never actually measure its performance
-#            in the token reduction phase -- which could have a serious degradation in quality that our optimizer doesn't see
-class TokenReducedConvertBonded(LLMConvertBonded):
-    # NOTE: moving these closer to the TokenReducedConvertBonded class for now (in part to make
-    #       them easier to mock); we can make these parameterized as well
-    MAX_HEATMAP_UPDATES: int = 5
-    TOKEN_REDUCTION_SAMPLE: int = 0
-    TOKEN_REDUCTION_GRANULARITY: float = 0.001
-    def __init__(self, token_budget: float, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.token_budget = token_budget
-        self.resolution = self.TOKEN_REDUCTION_GRANULARITY
-        self.first_execution = True
-        self.count = 0
-        self.heatmap = [0] * int(1.0 / self.resolution)
-    def __str__(self):
-        op = super().__str__()
-        op += f"    Token Budget: {str(self.token_budget)}\n"
-        return op
-    def get_id_params(self):
-        id_params = super().get_id_params()
-        id_params = {"token_budget": self.token_budget, **id_params}
-        return id_params
-    def get_op_params(self):
-        op_params = super().get_op_params()
-        return {"token_budget": self.token_budget, **op_params}
-    def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
-        """
-        Update the cost per record and quality estimates produced by LLMConvert's naive estimates.
-        We adjust the cost per record to account for the reduced number of input tokens following
-        token reduction, and we make a crude estimate of the quality degradation that results from
-        using fewer tokens.
-        """
-        # get naive cost estimates from LLMConvert
-        naive_op_cost_estimates = super().naive_cost_estimates(source_op_cost_estimates)
-        # re-compute cost per record assuming we use fewer input tokens
-        est_num_input_tokens = NAIVE_EST_NUM_INPUT_TOKENS * self.token_budget
-        est_num_output_tokens = NAIVE_EST_NUM_OUTPUT_TOKENS
-        model_conversion_usd_per_record = (
-            MODEL_CARDS[self.model.value]["usd_per_input_token"] * est_num_input_tokens
-            + MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
-        )
-        # set refined estimate of cost per record and, for now,
-        # assume quality multiplier is proportional to sqrt(sqrt(token_budget))
-        naive_op_cost_estimates.cost_per_record = model_conversion_usd_per_record
-        naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
-        naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record
-        naive_op_cost_estimates.quality = (naive_op_cost_estimates.quality) * math.sqrt(math.sqrt(self.token_budget))
-        naive_op_cost_estimates.quality_lower_bound = naive_op_cost_estimates.quality
-        naive_op_cost_estimates.quality_upper_bound = naive_op_cost_estimates.quality
-        return naive_op_cost_estimates
-    def is_image_conversion(self) -> bool:
-        """TokenReducedConvertBonded is currently disallowed on image conversions, so this must be False."""
-        return False
-    def reduce_context(self, full_context: str) -> str:
-        range = find_best_range(
-            self.heatmap,
-            int(self.token_budget / self.TOKEN_REDUCTION_GRANULARITY),
-            trim_zeros=False,
-        )
-        if not range:
-            raise Exception("No range found in heatmap")
-        si, ei = range
-        print("si:", si, "ei:", ei)
-        sr, er = (
-            si * self.TOKEN_REDUCTION_GRANULARITY,
-            ei * self.TOKEN_REDUCTION_GRANULARITY,
-        )
-        test_len = len(full_context)
-        start = int(sr * test_len)
-        end = int(er * test_len)
-        if self.verbose:
-            print(f"start ratio: {sr} -- end ratio: {er}")
-            print("character start:", start, "end:", end)
-        sample = full_context[start:end]
-        return sample
-    def _dspy_generate_fields(self, prompt: str, content: str | list[str]) -> tuple[list[dict[str, list]] | Any]:
-        raise Exception(
-            "TokenReducedConvertBonded is executing despite being deprecated until implementation changes can be made."
-        )
-        answer, query_stats = None, None
-        if self.first_execution or self.count < self.MAX_HEATMAP_UPDATES:
-            if self.verbose:
-                print("Warming up heatmap")
-            answer, query_stats = super()._dspy_generate_fields(prompt, content)
-            self.first_execution = False
-        else:
-            if self.verbose:
-                print("Using heatmap")
-            # only refer to the heatmap if the count is greater than a enough sample size
-            # TODO: only trim the context if the attention is clustered in a small region
-            if self.count >= self.TOKEN_REDUCTION_SAMPLE:
-                context = self.reduce_context(content)
-                try:
-                    answer, _, query_stats = self.generator.generate(context=context, prompt=prompt)
-                except Exception as e:
-                    print(f"DSPy generation error: {e}, falling back to unreduced generation")
-                    answer, query_stats = super()._dspy_generate_fields(prompt, content)
-        # TODO: answer and query stats may be unbound if we hit the else block
-        # and count < TOKEN_REDUCTION_SAMPLE, which makes the below pretty clunky
-        # this throw asserts our view of the world and we should refactor this
-        if answer is None or query_stats is None:
-            raise Exception("answer or query_stats is None")
-        try:
-            match = best_substring_match(answer, content)
-            if not match:
-                gsi, gei = 0, len(content)
-            else:
-                gsi, gei = match
-        except Exception as e:
-            print("Error in substring match:", e)
-            gsi, gei = 0, len(content)
-        context_len = len(content)
-        gsr, ger = gsi / context_len, gei / context_len
-        norm_si, norm_ei = int(gsr / self.resolution), int(ger / self.resolution)
-        if self.verbose:
-            print(f"best_start: {gsi} -- best_end: {gei}")
-        self.count += 1
-        self.heatmap[norm_si:norm_ei] = map(lambda x: x + 1, self.heatmap[norm_si:norm_ei])
-        return answer, query_stats

palimpzest-0.7.1/src/palimpzest/utils/token_reduction_helpers.py DELETED Viewed

@@ -1,105 +0,0 @@
-from fuzzywuzzy import fuzz, process
-def find_best_range(values, budget, trim_zeros=False):
-    """
-    Finds the consecutive range with the biggest sum within a budget.
-    Args:
-        values: A list of non-negative numbers.
-        budget: The maximum number of consecutive elements to consider.
-    Returns:
-        A tuple containing the start and end indices (inclusive) of the best range,
-        or None if the array is empty.
-    """
-    if not values:
-        return None
-    n = len(values)
-    best_sum, best_start, current_sum, current_start = 0, 0, 0, 0
-    # Iterate through the array, keeping track of current and best ranges.
-    for i in range(n):
-        current_sum += values[i]
-        # If the current range exceeds the budget, remove elements from the beginning.
-        while current_start + budget - 1 < i and current_start + budget - 1 >= 0:
-            current_sum -= values[current_start]
-            current_start += 1
-        # Update best range if the current sum is bigger.
-        if current_sum > best_sum:
-            best_sum = current_sum
-            best_start = current_start
-    best_end = best_start + budget - 1
-    print("best_start:", best_start, "best_end:", best_end)
-    if trim_zeros:
-        # Trim leading/trailing zeros
-        while best_start >= 0 and values[best_start] == 0:
-            best_start += 1
-        while best_end < n and values[best_end] == 0:
-            best_end -= 1
-    else:
-        # balance the zero entries equally on both sides
-        leading_zeros = 0
-        trailing_zeros = 0
-        start_idx = best_start
-        end_idx = best_end
-        while start_idx >= 0 and values[start_idx] == 0:
-            leading_zeros += 1
-            start_idx += 1
-        while end_idx < n and values[end_idx] == 0:
-            trailing_zeros += 1
-            end_idx -= 1
-        half_zeros = int((leading_zeros + trailing_zeros) / 2)
-        print("leading_zeros:", leading_zeros, "trailing_zeros:", trailing_zeros, "half_zeros:", half_zeros)
-        best_start = best_start - half_zeros + leading_zeros
-        best_end = best_end - trailing_zeros + leading_zeros + trailing_zeros - half_zeros
-        if best_start < 0:
-            best_end = best_end - best_start
-            best_start = 0
-        if best_end >= n:
-            best_start = best_start - (best_end - n + 1)
-            best_end = n - 1
-    return best_start, best_end + 1
-def get_range_from_hist(file_path, range_budget, resolution=0.001, trim_zeros=True):
-    # Load data from csv file and extract he second column as values
-    values = []
-    with open(file_path) as file:
-        for line in file:
-            line = line.strip()
-            values.append(int(float(line.split(",")[1])))
-    index_range = 1 / resolution
-    budget = int(range_budget * index_range)
-    # Find the best range
-    range = find_best_range(values, budget, trim_zeros=trim_zeros)
-    if not range:
-        raise ValueError("No range found")
-    start, end = range
-    print("start:", start, "end:", end, "index_range:", index_range)
-    return start * 1.0 / index_range, end * 1.0 / index_range
-def best_substring_match(query: str, context: str | list[str]):
-    # This will extract all substrings of length equal to the query from the string
-    candidates = [context[i : i + len(query)] for i in range(len(context) - len(query) + 1)]
-    # Find the best match among the candidates
-    ret = process.extractOne(query, candidates, scorer=fuzz.ratio)
-    if ret is None:
-        return None
-    best_match, score = ret
-    positions = [can == best_match for can in candidates]
-    start = positions.index(True)
-    end = start + len(query)
-    # print("best match:", best_match, "score:", score, "start:", start, "end:", end)
-    # print("-------", string[start:end])
-    return start, end