palimpzest 0.7.1__tar.gz → 0.7.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {palimpzest-0.7.1/src/palimpzest.egg-info → palimpzest-0.7.3}/PKG-INFO +4 -25
- {palimpzest-0.7.1 → palimpzest-0.7.3}/pyproject.toml +4 -25
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/data/datareaders.py +1 -18
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/rag_convert.py +1 -2
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/split_convert.py +1 -2
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/__init__.py +0 -4
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/cost_model.py +0 -12
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/optimizer.py +1 -11
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/rules.py +0 -76
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/config.py +0 -2
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/demo_helpers.py +0 -2
- {palimpzest-0.7.1 → palimpzest-0.7.3/src/palimpzest.egg-info}/PKG-INFO +4 -25
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest.egg-info/SOURCES.txt +0 -2
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest.egg-info/requires.txt +3 -24
- palimpzest-0.7.1/src/palimpzest/query/operators/token_reduction_convert.py +0 -169
- palimpzest-0.7.1/src/palimpzest/utils/token_reduction_helpers.py +0 -105
- {palimpzest-0.7.1 → palimpzest-0.7.3}/LICENSE +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/README.md +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/setup.cfg +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/__init__.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/constants.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/__init__.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/data/__init__.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/data/dataclasses.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/elements/__init__.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/elements/filters.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/elements/groupbysig.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/elements/index.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/elements/records.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/lib/__init__.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/lib/fields.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/lib/schemas.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/policy.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/__init__.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/code_synthesis_prompts.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/convert_prompts.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/critique_and_refine_convert_prompts.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/filter_prompts.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/moa_aggregator_convert_prompts.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/moa_proposer_convert_prompts.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/prompt_factory.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/split_merge_prompts.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/split_proposer_prompts.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/util_phrases.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/__init__.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/__init__.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/execution_strategy.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/execution_strategy_type.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/mab_execution_strategy.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/parallel_execution_strategy.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/random_sampling_execution_strategy.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/single_threaded_execution_strategy.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/generators/__init__.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/generators/api_client_factory.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/generators/generators.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/__init__.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/aggregate.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/code_synthesis_convert.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/convert.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/critique_and_refine_convert.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/filter.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/limit.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/logical.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/map.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/mixture_of_agents_convert.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/physical.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/project.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/retrieve.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/scan.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/optimizer_strategy.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/optimizer_strategy_type.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/plan.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/primitives.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/tasks.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/__init__.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/nosentinel_processor.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/processing_strategy_type.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/query_processor.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/query_processor_factory.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/sentinel_processor.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/streaming_processor.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/schemabuilder/__init__.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/schemabuilder/schema_builder.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/sets.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/tools/README.md +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/tools/__init__.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/tools/allenpdf.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/tools/pdfparser.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/tools/skema_tools.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/__init__.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/datareader_helpers.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/env_helpers.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/field_helpers.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/generation_helpers.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/hash_helpers.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/model_helpers.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/progress.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/sandbox.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/udfs.py +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest.egg-info/dependency_links.txt +0 -0
- {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: palimpzest
|
|
3
|
-
Version: 0.7.1
|
|
3
|
+
Version: 0.7.3
|
|
4
4
|
Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
|
|
5
5
|
Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
|
|
6
6
|
Project-URL: homepage, https://palimpzest.org
|
|
@@ -15,45 +15,25 @@ Classifier: Programming Language :: Python :: 3.8
|
|
|
15
15
|
Requires-Python: >=3.8
|
|
16
16
|
Description-Content-Type: text/markdown
|
|
17
17
|
License-File: LICENSE
|
|
18
|
-
Requires-Dist: charset-normalizer>=3.3.2
|
|
19
18
|
Requires-Dist: chromadb>=0.6.3
|
|
20
|
-
Requires-Dist: click>=8.1.7
|
|
21
|
-
Requires-Dist: click-aliases>=1.0.4
|
|
22
|
-
Requires-Dist: colorama>=0.4.6
|
|
23
19
|
Requires-Dist: fastapi~=0.115.0
|
|
24
|
-
Requires-Dist:
|
|
25
|
-
Requires-Dist: google-generativeai>=0.8.0
|
|
26
|
-
Requires-Dist: gradio>=4.20.1
|
|
27
|
-
Requires-Dist: grobid-client-python==0.0.5
|
|
28
|
-
Requires-Dist: ipython>=8.26.0
|
|
29
|
-
Requires-Dist: opencv-python-headless<4.9.0,>=4.8.0
|
|
30
|
-
Requires-Dist: jupyter
|
|
31
|
-
Requires-Dist: layoutparser>=0.3.4
|
|
32
|
-
Requires-Dist: lxml-html-clean>=0.1.1
|
|
20
|
+
Requires-Dist: gradio>=5.26.0
|
|
33
21
|
Requires-Dist: mkdocs>=1.6.1
|
|
34
22
|
Requires-Dist: mkdocs-material>=9.6.3
|
|
35
23
|
Requires-Dist: mkdocs-material[imaging]
|
|
36
24
|
Requires-Dist: mkdocstrings-python>=1.15.0
|
|
37
|
-
Requires-Dist: modal>=0.62.198
|
|
38
|
-
Requires-Dist: ncls==0.0.68
|
|
39
|
-
Requires-Dist: necessary>=0.3.2
|
|
40
25
|
Requires-Dist: numpy>=1.23.2
|
|
41
26
|
Requires-Dist: openai>=1.0
|
|
42
|
-
Requires-Dist: openpyxl==3.1.2
|
|
43
27
|
Requires-Dist: pandas>=2.1.1
|
|
44
|
-
Requires-Dist: papermage>=0.16.0
|
|
45
|
-
Requires-Dist: pdf2image
|
|
46
28
|
Requires-Dist: pytest>=8.2.2
|
|
47
|
-
Requires-Dist:
|
|
48
|
-
Requires-Dist: pdfplumber==0.7.4
|
|
49
|
-
Requires-Dist: pillow>=10.2.0
|
|
29
|
+
Requires-Dist: pillow
|
|
50
30
|
Requires-Dist: prettytable>=3.9.0
|
|
31
|
+
Requires-Dist: psutil>=7.0.0
|
|
51
32
|
Requires-Dist: PyLD>=2.0.4
|
|
52
33
|
Requires-Dist: pyarrow<15.0.0,>=13.0.0; python_version < "3.12"
|
|
53
34
|
Requires-Dist: pyarrow<19.0.0,>=15.0.0; python_version >= "3.12"
|
|
54
35
|
Requires-Dist: pypdf>=5.1.0
|
|
55
36
|
Requires-Dist: pytest-mock>=3.14.0
|
|
56
|
-
Requires-Dist: python-Levenshtein>=0.25.1
|
|
57
37
|
Requires-Dist: pyyaml>=6.0.1
|
|
58
38
|
Requires-Dist: ragatouille>=0.0.9
|
|
59
39
|
Requires-Dist: requests>=2.25
|
|
@@ -64,7 +44,6 @@ Requires-Dist: together>=1.3.1
|
|
|
64
44
|
Requires-Dist: tqdm~=4.66.1
|
|
65
45
|
Requires-Dist: transformers<4.50.0,>=4.41.3
|
|
66
46
|
Requires-Dist: rich[jupyter]>=13.9.2
|
|
67
|
-
Requires-Dist: voyager>=2.0.9
|
|
68
47
|
Dynamic: license-file
|
|
69
48
|
|
|
70
49
|

|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "palimpzest"
|
|
3
|
-
version = "0.7.1"
|
|
3
|
+
version = "0.7.3"
|
|
4
4
|
description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.8"
|
|
@@ -9,45 +9,25 @@ authors = [
|
|
|
9
9
|
{name="MIT DSG Semantic Management Lab", email="michjc@csail.mit.edu"},
|
|
10
10
|
]
|
|
11
11
|
dependencies = [
|
|
12
|
-
"charset-normalizer>=3.3.2",
|
|
13
12
|
"chromadb>=0.6.3",
|
|
14
|
-
"click>=8.1.7",
|
|
15
|
-
"click-aliases>=1.0.4",
|
|
16
|
-
"colorama>=0.4.6",
|
|
17
13
|
"fastapi~=0.115.0",
|
|
18
|
-
"
|
|
19
|
-
"google-generativeai>=0.8.0",
|
|
20
|
-
"gradio>=4.20.1",
|
|
21
|
-
"grobid-client-python==0.0.5",
|
|
22
|
-
"ipython>=8.26.0",
|
|
23
|
-
"opencv-python-headless>=4.8.0,<4.9.0",
|
|
24
|
-
"jupyter",
|
|
25
|
-
"layoutparser>=0.3.4",
|
|
26
|
-
"lxml-html-clean>=0.1.1",
|
|
14
|
+
"gradio>=5.26.0",
|
|
27
15
|
"mkdocs>=1.6.1",
|
|
28
16
|
"mkdocs-material>=9.6.3",
|
|
29
17
|
"mkdocs-material[imaging]",
|
|
30
18
|
"mkdocstrings-python>=1.15.0",
|
|
31
|
-
"modal>=0.62.198",
|
|
32
|
-
"ncls==0.0.68",
|
|
33
|
-
"necessary>=0.3.2",
|
|
34
19
|
"numpy>=1.23.2",
|
|
35
20
|
"openai>=1.0",
|
|
36
|
-
"openpyxl==3.1.2",
|
|
37
21
|
"pandas>=2.1.1",
|
|
38
|
-
"papermage>=0.16.0",
|
|
39
|
-
"pdf2image",
|
|
40
22
|
"pytest>=8.2.2",
|
|
41
|
-
"
|
|
42
|
-
"pdfplumber==0.7.4",
|
|
43
|
-
"pillow>=10.2.0",
|
|
23
|
+
"pillow",
|
|
44
24
|
"prettytable>=3.9.0",
|
|
25
|
+
"psutil>=7.0.0",
|
|
45
26
|
"PyLD>=2.0.4",
|
|
46
27
|
"pyarrow>=13.0.0,<15.0.0; python_version<'3.12'",
|
|
47
28
|
"pyarrow>=15.0.0,<19.0.0; python_version>='3.12'",
|
|
48
29
|
"pypdf>=5.1.0",
|
|
49
30
|
"pytest-mock>=3.14.0",
|
|
50
|
-
"python-Levenshtein>=0.25.1",
|
|
51
31
|
"pyyaml>=6.0.1",
|
|
52
32
|
"ragatouille>=0.0.9",
|
|
53
33
|
"requests>=2.25",
|
|
@@ -58,7 +38,6 @@ dependencies = [
|
|
|
58
38
|
"tqdm~=4.66.1",
|
|
59
39
|
"transformers>=4.41.3,<4.50.0",
|
|
60
40
|
"rich[jupyter]>=13.9.2",
|
|
61
|
-
"voyager>=2.0.9",
|
|
62
41
|
]
|
|
63
42
|
classifiers=[
|
|
64
43
|
"Development Status :: 4 - Beta", # Change as appropriate
|
|
@@ -1,15 +1,12 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import base64
|
|
4
|
-
import json
|
|
5
4
|
import os
|
|
6
5
|
from abc import ABC, abstractmethod
|
|
7
6
|
from io import BytesIO
|
|
8
7
|
|
|
9
|
-
import modal
|
|
10
8
|
import pandas as pd
|
|
11
9
|
from bs4 import BeautifulSoup
|
|
12
|
-
from papermage import Document
|
|
13
10
|
|
|
14
11
|
from palimpzest import constants
|
|
15
12
|
from palimpzest.core.lib.schemas import (
|
|
@@ -390,22 +387,8 @@ class PDFFileDirectoryReader(DirectoryReader):
|
|
|
390
387
|
with open(filepath, "rb") as f:
|
|
391
388
|
pdf_bytes = f.read()
|
|
392
389
|
|
|
393
|
-
if self.pdfprocessor == "modal":
|
|
394
|
-
print("handling PDF processing remotely")
|
|
395
|
-
remote_func = modal.Function.lookup("palimpzest.tools", "processPapermagePdf")
|
|
396
|
-
else:
|
|
397
|
-
remote_func = None
|
|
398
|
-
|
|
399
390
|
# generate text_content from PDF
|
|
400
|
-
|
|
401
|
-
doc_json_str = remote_func.remote([pdf_bytes])
|
|
402
|
-
docdict = json.loads(doc_json_str[0])
|
|
403
|
-
doc = Document.from_json(docdict)
|
|
404
|
-
text_content = ""
|
|
405
|
-
for p in doc.pages:
|
|
406
|
-
text_content += p.text
|
|
407
|
-
else:
|
|
408
|
-
text_content = get_text_from_pdf(pdf_filename, pdf_bytes, pdfprocessor=self.pdfprocessor, file_cache_dir=self.file_cache_dir)
|
|
391
|
+
text_content = get_text_from_pdf(pdf_filename, pdf_bytes, pdfprocessor=self.pdfprocessor, file_cache_dir=self.file_cache_dir)
|
|
409
392
|
|
|
410
393
|
# construct and return item
|
|
411
394
|
return {"filename": pdf_filename, "contents": pdf_bytes, "text_contents": text_content}
|
|
@@ -64,8 +64,7 @@ class RAGConvert(LLMConvert):
|
|
|
64
64
|
+ MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
|
|
65
65
|
)
|
|
66
66
|
|
|
67
|
-
# set refined estimate of cost per record
|
|
68
|
-
# assume quality multiplier is proportional to sqrt(sqrt(token_budget))
|
|
67
|
+
# set refined estimate of cost per record
|
|
69
68
|
naive_op_cost_estimates.cost_per_record = model_conversion_usd_per_record
|
|
70
69
|
naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
|
|
71
70
|
naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record
|
|
@@ -61,8 +61,7 @@ class SplitConvert(LLMConvert):
|
|
|
61
61
|
+ MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
|
|
62
62
|
)
|
|
63
63
|
|
|
64
|
-
# set refined estimate of cost per record
|
|
65
|
-
# assume quality multiplier is proportional to sqrt(sqrt(token_budget))
|
|
64
|
+
# set refined estimate of cost per record
|
|
66
65
|
naive_op_cost_estimates.cost_per_record = model_conversion_usd_per_record
|
|
67
66
|
naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
|
|
68
67
|
naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record
|
|
@@ -46,9 +46,6 @@ from palimpzest.query.optimizer.rules import (
|
|
|
46
46
|
from palimpzest.query.optimizer.rules import (
|
|
47
47
|
SplitConvertRule as _SplitConvertRule,
|
|
48
48
|
)
|
|
49
|
-
from palimpzest.query.optimizer.rules import (
|
|
50
|
-
TokenReducedConvertBondedRule as _TokenReducedConvertBondedRule,
|
|
51
|
-
)
|
|
52
49
|
from palimpzest.query.optimizer.rules import (
|
|
53
50
|
TransformationRule as _TransformationRule,
|
|
54
51
|
)
|
|
@@ -70,7 +67,6 @@ ALL_RULES = [
|
|
|
70
67
|
_RetrieveRule,
|
|
71
68
|
_Rule,
|
|
72
69
|
_SplitConvertRule,
|
|
73
|
-
_TokenReducedConvertBondedRule,
|
|
74
70
|
_TransformationRule,
|
|
75
71
|
]
|
|
76
72
|
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
-
import math
|
|
5
4
|
|
|
6
5
|
# NOTE: the answer.mode() call(s) inside of _est_quality() throw a UserWarning when there are multiple
|
|
7
6
|
# answers to a convert with the same mode. This is because pandas tries to sort the answers
|
|
@@ -24,7 +23,6 @@ from palimpzest.query.operators.limit import LimitScanOp
|
|
|
24
23
|
from palimpzest.query.operators.physical import PhysicalOperator
|
|
25
24
|
from palimpzest.query.operators.rag_convert import RAGConvert
|
|
26
25
|
from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp, ScanPhysicalOp
|
|
27
|
-
from palimpzest.query.operators.token_reduction_convert import TokenReducedConvertBonded
|
|
28
26
|
from palimpzest.utils.model_helpers import get_champion_model_name, get_models
|
|
29
27
|
|
|
30
28
|
warnings.simplefilter(action='ignore', category=UserWarning)
|
|
@@ -574,16 +572,6 @@ class CostModel(BaseCostModel):
|
|
|
574
572
|
op_estimates.cost_per_record = 1e-4
|
|
575
573
|
op_estimates.quality = op_estimates.quality * (GPT_4o_MODEL_CARD["code"] / 100.0)
|
|
576
574
|
|
|
577
|
-
# token reduction adjustment
|
|
578
|
-
if isinstance(operator, TokenReducedConvertBonded):
|
|
579
|
-
total_input_tokens = operator.token_budget * sample_op_estimates[op_id][model_name]["total_input_tokens"]
|
|
580
|
-
total_output_tokens = sample_op_estimates[op_id][model_name]["total_output_tokens"]
|
|
581
|
-
op_estimates.cost_per_record = (
|
|
582
|
-
MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens
|
|
583
|
-
+ MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens
|
|
584
|
-
)
|
|
585
|
-
op_estimates.quality = op_estimates.quality * math.sqrt(math.sqrt(operator.token_budget))
|
|
586
|
-
|
|
587
575
|
# rag convert adjustment
|
|
588
576
|
if isinstance(operator, RAGConvert):
|
|
589
577
|
total_input_tokens = operator.num_chunks_per_field * operator.chunk_size
|
|
@@ -34,7 +34,6 @@ from palimpzest.query.optimizer.rules import (
|
|
|
34
34
|
MixtureOfAgentsConvertRule,
|
|
35
35
|
RAGConvertRule,
|
|
36
36
|
SplitConvertRule,
|
|
37
|
-
TokenReducedConvertBondedRule,
|
|
38
37
|
)
|
|
39
38
|
from palimpzest.query.optimizer.tasks import (
|
|
40
39
|
ApplyRule,
|
|
@@ -90,7 +89,6 @@ class Optimizer:
|
|
|
90
89
|
verbose: bool = False,
|
|
91
90
|
allow_bonded_query: bool = True,
|
|
92
91
|
allow_code_synth: bool = False,
|
|
93
|
-
allow_token_reduction: bool = False,
|
|
94
92
|
allow_rag_reduction: bool = False,
|
|
95
93
|
allow_mixtures: bool = True,
|
|
96
94
|
allow_critic: bool = False,
|
|
@@ -134,7 +132,6 @@ class Optimizer:
|
|
|
134
132
|
if optimizer_strategy == OptimizationStrategyType.NONE:
|
|
135
133
|
self.allow_bonded_query = True
|
|
136
134
|
self.allow_code_synth = False
|
|
137
|
-
self.allow_token_reduction = False
|
|
138
135
|
self.allow_rag_reduction = False
|
|
139
136
|
self.allow_mixtures = False
|
|
140
137
|
self.allow_critic = False
|
|
@@ -147,7 +144,6 @@ class Optimizer:
|
|
|
147
144
|
self.available_models = available_models
|
|
148
145
|
self.allow_bonded_query = allow_bonded_query
|
|
149
146
|
self.allow_code_synth = allow_code_synth
|
|
150
|
-
self.allow_token_reduction = allow_token_reduction
|
|
151
147
|
self.allow_rag_reduction = allow_rag_reduction
|
|
152
148
|
self.allow_mixtures = allow_mixtures
|
|
153
149
|
self.allow_critic = allow_critic
|
|
@@ -160,7 +156,7 @@ class Optimizer:
|
|
|
160
156
|
self.implementation_rules = [
|
|
161
157
|
rule
|
|
162
158
|
for rule in self.implementation_rules
|
|
163
|
-
if rule not in [LLMConvertBondedRule, TokenReducedConvertBondedRule]
|
|
159
|
+
if rule not in [LLMConvertBondedRule]
|
|
164
160
|
]
|
|
165
161
|
|
|
166
162
|
if not self.allow_code_synth:
|
|
@@ -168,11 +164,6 @@ class Optimizer:
|
|
|
168
164
|
rule for rule in self.implementation_rules if not issubclass(rule, CodeSynthesisConvertRule)
|
|
169
165
|
]
|
|
170
166
|
|
|
171
|
-
if not self.allow_token_reduction:
|
|
172
|
-
self.implementation_rules = [
|
|
173
|
-
rule for rule in self.implementation_rules if not issubclass(rule, TokenReducedConvertBondedRule)
|
|
174
|
-
]
|
|
175
|
-
|
|
176
167
|
if not self.allow_rag_reduction:
|
|
177
168
|
self.implementation_rules = [
|
|
178
169
|
rule for rule in self.implementation_rules if not issubclass(rule, RAGConvertRule)
|
|
@@ -218,7 +209,6 @@ class Optimizer:
|
|
|
218
209
|
available_models=self.available_models,
|
|
219
210
|
allow_bonded_query=self.allow_bonded_query,
|
|
220
211
|
allow_code_synth=self.allow_code_synth,
|
|
221
|
-
allow_token_reduction=self.allow_token_reduction,
|
|
222
212
|
allow_rag_reduction=self.allow_rag_reduction,
|
|
223
213
|
allow_mixtures=self.allow_mixtures,
|
|
224
214
|
allow_critic=self.allow_critic,
|
|
@@ -28,7 +28,6 @@ from palimpzest.query.operators.rag_convert import RAGConvert
|
|
|
28
28
|
from palimpzest.query.operators.retrieve import RetrieveOp
|
|
29
29
|
from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp
|
|
30
30
|
from palimpzest.query.operators.split_convert import SplitConvert
|
|
31
|
-
from palimpzest.query.operators.token_reduction_convert import TokenReducedConvertBonded
|
|
32
31
|
from palimpzest.query.optimizer.primitives import Expression, Group, LogicalExpression, PhysicalExpression
|
|
33
32
|
from palimpzest.utils.model_helpers import get_models, get_vision_models
|
|
34
33
|
|
|
@@ -352,81 +351,6 @@ class LLMConvertBondedRule(ImplementationRule):
|
|
|
352
351
|
return deduped_physical_expressions
|
|
353
352
|
|
|
354
353
|
|
|
355
|
-
class TokenReducedConvertBondedRule(ImplementationRule):
|
|
356
|
-
"""
|
|
357
|
-
Substitute a logical expression for a ConvertScan with a bonded token reduced physical implementation.
|
|
358
|
-
"""
|
|
359
|
-
|
|
360
|
-
token_budgets = [0.1, 0.5, 0.9]
|
|
361
|
-
|
|
362
|
-
@classmethod
|
|
363
|
-
def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
|
|
364
|
-
logical_op = logical_expression.operator
|
|
365
|
-
is_image_conversion = any(
|
|
366
|
-
[
|
|
367
|
-
field.is_image_field
|
|
368
|
-
for field_name, field in logical_expression.input_fields.items()
|
|
369
|
-
if field_name.split(".")[-1] in logical_expression.depends_on_field_names
|
|
370
|
-
]
|
|
371
|
-
)
|
|
372
|
-
is_match = isinstance(logical_op, ConvertScan) and not is_image_conversion and logical_op.udf is None
|
|
373
|
-
logger.debug(f"TokenReducedConvertBondedRule matches_pattern: {is_match} for {logical_expression}")
|
|
374
|
-
return is_match
|
|
375
|
-
|
|
376
|
-
@classmethod
|
|
377
|
-
def substitute(cls, logical_expression: LogicalExpression, **physical_op_params) -> set[PhysicalExpression]:
|
|
378
|
-
logger.debug(f"Substituting TokenReducedConvertBondedRule for {logical_expression}")
|
|
379
|
-
|
|
380
|
-
logical_op = logical_expression.operator
|
|
381
|
-
|
|
382
|
-
# get initial set of parameters for physical op
|
|
383
|
-
op_kwargs = logical_op.get_logical_op_params()
|
|
384
|
-
op_kwargs.update(
|
|
385
|
-
{
|
|
386
|
-
"verbose": physical_op_params["verbose"],
|
|
387
|
-
"logical_op_id": logical_op.get_logical_op_id(),
|
|
388
|
-
"logical_op_name": logical_op.logical_op_name(),
|
|
389
|
-
}
|
|
390
|
-
)
|
|
391
|
-
|
|
392
|
-
# NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
|
|
393
|
-
# thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
|
|
394
|
-
#
|
|
395
|
-
# identify models which can be used strictly for text or strictly for images
|
|
396
|
-
vision_models = set(get_vision_models())
|
|
397
|
-
text_models = set(get_models())
|
|
398
|
-
pure_vision_models = {model for model in vision_models if model not in text_models}
|
|
399
|
-
|
|
400
|
-
physical_expressions = []
|
|
401
|
-
for model in physical_op_params["available_models"]:
|
|
402
|
-
for token_budget in cls.token_budgets:
|
|
403
|
-
# skip this model if this is a pure image model
|
|
404
|
-
if model in pure_vision_models:
|
|
405
|
-
continue
|
|
406
|
-
|
|
407
|
-
# construct multi-expression
|
|
408
|
-
op = TokenReducedConvertBonded(
|
|
409
|
-
model=model,
|
|
410
|
-
prompt_strategy=PromptStrategy.COT_QA,
|
|
411
|
-
token_budget=token_budget,
|
|
412
|
-
**op_kwargs,
|
|
413
|
-
)
|
|
414
|
-
expression = PhysicalExpression(
|
|
415
|
-
operator=op,
|
|
416
|
-
input_group_ids=logical_expression.input_group_ids,
|
|
417
|
-
input_fields=logical_expression.input_fields,
|
|
418
|
-
depends_on_field_names=logical_expression.depends_on_field_names,
|
|
419
|
-
generated_fields=logical_expression.generated_fields,
|
|
420
|
-
group_id=logical_expression.group_id,
|
|
421
|
-
)
|
|
422
|
-
physical_expressions.append(expression)
|
|
423
|
-
|
|
424
|
-
logger.debug(f"Done substituting TokenReducedConvertBondedRule for {logical_expression}")
|
|
425
|
-
deduped_physical_expressions = set(physical_expressions)
|
|
426
|
-
|
|
427
|
-
return deduped_physical_expressions
|
|
428
|
-
|
|
429
|
-
|
|
430
354
|
class CodeSynthesisConvertRule(ImplementationRule):
|
|
431
355
|
"""
|
|
432
356
|
Base rule for code synthesis convert operators; the physical convert class
|
|
@@ -31,7 +31,6 @@ class QueryProcessorConfig:
|
|
|
31
31
|
allow_bonded_query: bool = field(default=True)
|
|
32
32
|
allow_model_selection: bool = field(default=True)
|
|
33
33
|
allow_code_synth: bool = field(default=False)
|
|
34
|
-
allow_token_reduction: bool = field(default=False)
|
|
35
34
|
allow_rag_reduction: bool = field(default=True)
|
|
36
35
|
allow_mixtures: bool = field(default=True)
|
|
37
36
|
allow_critic: bool = field(default=True)
|
|
@@ -59,7 +58,6 @@ class QueryProcessorConfig:
|
|
|
59
58
|
"allow_bonded_query": self.allow_bonded_query,
|
|
60
59
|
"allow_model_selection": self.allow_model_selection,
|
|
61
60
|
"allow_code_synth": self.allow_code_synth,
|
|
62
|
-
"allow_token_reduction": self.allow_token_reduction,
|
|
63
61
|
"allow_rag_reduction": self.allow_rag_reduction,
|
|
64
62
|
"allow_mixtures": self.allow_mixtures,
|
|
65
63
|
"allow_critic": self.allow_critic,
|
|
@@ -47,8 +47,6 @@ def create_plan_str(flatten_ops):
|
|
|
47
47
|
else str(right.filter.filter_fn)
|
|
48
48
|
)
|
|
49
49
|
plan_str += f'\n Filter: "{filter_str}"'
|
|
50
|
-
if hasattr(right, "token_budget"):
|
|
51
|
-
plan_str += f"\n Token budget: {right.token_budget}"
|
|
52
50
|
plan_str += "\n"
|
|
53
51
|
plan_str += (
|
|
54
52
|
f" ({','.join(in_schema.field_names())[:15]}...) -> ({','.join(out_schema.field_names())[:15]}...)"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: palimpzest
|
|
3
|
-
Version: 0.7.1
|
|
3
|
+
Version: 0.7.3
|
|
4
4
|
Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
|
|
5
5
|
Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
|
|
6
6
|
Project-URL: homepage, https://palimpzest.org
|
|
@@ -15,45 +15,25 @@ Classifier: Programming Language :: Python :: 3.8
|
|
|
15
15
|
Requires-Python: >=3.8
|
|
16
16
|
Description-Content-Type: text/markdown
|
|
17
17
|
License-File: LICENSE
|
|
18
|
-
Requires-Dist: charset-normalizer>=3.3.2
|
|
19
18
|
Requires-Dist: chromadb>=0.6.3
|
|
20
|
-
Requires-Dist: click>=8.1.7
|
|
21
|
-
Requires-Dist: click-aliases>=1.0.4
|
|
22
|
-
Requires-Dist: colorama>=0.4.6
|
|
23
19
|
Requires-Dist: fastapi~=0.115.0
|
|
24
|
-
Requires-Dist:
|
|
25
|
-
Requires-Dist: google-generativeai>=0.8.0
|
|
26
|
-
Requires-Dist: gradio>=4.20.1
|
|
27
|
-
Requires-Dist: grobid-client-python==0.0.5
|
|
28
|
-
Requires-Dist: ipython>=8.26.0
|
|
29
|
-
Requires-Dist: opencv-python-headless<4.9.0,>=4.8.0
|
|
30
|
-
Requires-Dist: jupyter
|
|
31
|
-
Requires-Dist: layoutparser>=0.3.4
|
|
32
|
-
Requires-Dist: lxml-html-clean>=0.1.1
|
|
20
|
+
Requires-Dist: gradio>=5.26.0
|
|
33
21
|
Requires-Dist: mkdocs>=1.6.1
|
|
34
22
|
Requires-Dist: mkdocs-material>=9.6.3
|
|
35
23
|
Requires-Dist: mkdocs-material[imaging]
|
|
36
24
|
Requires-Dist: mkdocstrings-python>=1.15.0
|
|
37
|
-
Requires-Dist: modal>=0.62.198
|
|
38
|
-
Requires-Dist: ncls==0.0.68
|
|
39
|
-
Requires-Dist: necessary>=0.3.2
|
|
40
25
|
Requires-Dist: numpy>=1.23.2
|
|
41
26
|
Requires-Dist: openai>=1.0
|
|
42
|
-
Requires-Dist: openpyxl==3.1.2
|
|
43
27
|
Requires-Dist: pandas>=2.1.1
|
|
44
|
-
Requires-Dist: papermage>=0.16.0
|
|
45
|
-
Requires-Dist: pdf2image
|
|
46
28
|
Requires-Dist: pytest>=8.2.2
|
|
47
|
-
Requires-Dist:
|
|
48
|
-
Requires-Dist: pdfplumber==0.7.4
|
|
49
|
-
Requires-Dist: pillow>=10.2.0
|
|
29
|
+
Requires-Dist: pillow
|
|
50
30
|
Requires-Dist: prettytable>=3.9.0
|
|
31
|
+
Requires-Dist: psutil>=7.0.0
|
|
51
32
|
Requires-Dist: PyLD>=2.0.4
|
|
52
33
|
Requires-Dist: pyarrow<15.0.0,>=13.0.0; python_version < "3.12"
|
|
53
34
|
Requires-Dist: pyarrow<19.0.0,>=15.0.0; python_version >= "3.12"
|
|
54
35
|
Requires-Dist: pypdf>=5.1.0
|
|
55
36
|
Requires-Dist: pytest-mock>=3.14.0
|
|
56
|
-
Requires-Dist: python-Levenshtein>=0.25.1
|
|
57
37
|
Requires-Dist: pyyaml>=6.0.1
|
|
58
38
|
Requires-Dist: ragatouille>=0.0.9
|
|
59
39
|
Requires-Dist: requests>=2.25
|
|
@@ -64,7 +44,6 @@ Requires-Dist: together>=1.3.1
|
|
|
64
44
|
Requires-Dist: tqdm~=4.66.1
|
|
65
45
|
Requires-Dist: transformers<4.50.0,>=4.41.3
|
|
66
46
|
Requires-Dist: rich[jupyter]>=13.9.2
|
|
67
|
-
Requires-Dist: voyager>=2.0.9
|
|
68
47
|
Dynamic: license-file
|
|
69
48
|
|
|
70
49
|

|
|
@@ -60,7 +60,6 @@ src/palimpzest/query/operators/rag_convert.py
|
|
|
60
60
|
src/palimpzest/query/operators/retrieve.py
|
|
61
61
|
src/palimpzest/query/operators/scan.py
|
|
62
62
|
src/palimpzest/query/operators/split_convert.py
|
|
63
|
-
src/palimpzest/query/operators/token_reduction_convert.py
|
|
64
63
|
src/palimpzest/query/optimizer/__init__.py
|
|
65
64
|
src/palimpzest/query/optimizer/cost_model.py
|
|
66
65
|
src/palimpzest/query/optimizer/optimizer.py
|
|
@@ -95,5 +94,4 @@ src/palimpzest/utils/hash_helpers.py
|
|
|
95
94
|
src/palimpzest/utils/model_helpers.py
|
|
96
95
|
src/palimpzest/utils/progress.py
|
|
97
96
|
src/palimpzest/utils/sandbox.py
|
|
98
|
-
src/palimpzest/utils/token_reduction_helpers.py
|
|
99
97
|
src/palimpzest/utils/udfs.py
|
|
@@ -1,40 +1,20 @@
|
|
|
1
|
-
charset-normalizer>=3.3.2
|
|
2
1
|
chromadb>=0.6.3
|
|
3
|
-
click>=8.1.7
|
|
4
|
-
click-aliases>=1.0.4
|
|
5
|
-
colorama>=0.4.6
|
|
6
2
|
fastapi~=0.115.0
|
|
7
|
-
|
|
8
|
-
google-generativeai>=0.8.0
|
|
9
|
-
gradio>=4.20.1
|
|
10
|
-
grobid-client-python==0.0.5
|
|
11
|
-
ipython>=8.26.0
|
|
12
|
-
opencv-python-headless<4.9.0,>=4.8.0
|
|
13
|
-
jupyter
|
|
14
|
-
layoutparser>=0.3.4
|
|
15
|
-
lxml-html-clean>=0.1.1
|
|
3
|
+
gradio>=5.26.0
|
|
16
4
|
mkdocs>=1.6.1
|
|
17
5
|
mkdocs-material>=9.6.3
|
|
18
6
|
mkdocs-material[imaging]
|
|
19
7
|
mkdocstrings-python>=1.15.0
|
|
20
|
-
modal>=0.62.198
|
|
21
|
-
ncls==0.0.68
|
|
22
|
-
necessary>=0.3.2
|
|
23
8
|
numpy>=1.23.2
|
|
24
9
|
openai>=1.0
|
|
25
|
-
openpyxl==3.1.2
|
|
26
10
|
pandas>=2.1.1
|
|
27
|
-
papermage>=0.16.0
|
|
28
|
-
pdf2image
|
|
29
11
|
pytest>=8.2.2
|
|
30
|
-
|
|
31
|
-
pdfplumber==0.7.4
|
|
32
|
-
pillow>=10.2.0
|
|
12
|
+
pillow
|
|
33
13
|
prettytable>=3.9.0
|
|
14
|
+
psutil>=7.0.0
|
|
34
15
|
PyLD>=2.0.4
|
|
35
16
|
pypdf>=5.1.0
|
|
36
17
|
pytest-mock>=3.14.0
|
|
37
|
-
python-Levenshtein>=0.25.1
|
|
38
18
|
pyyaml>=6.0.1
|
|
39
19
|
ragatouille>=0.0.9
|
|
40
20
|
requests>=2.25
|
|
@@ -45,7 +25,6 @@ together>=1.3.1
|
|
|
45
25
|
tqdm~=4.66.1
|
|
46
26
|
transformers<4.50.0,>=4.41.3
|
|
47
27
|
rich[jupyter]>=13.9.2
|
|
48
|
-
voyager>=2.0.9
|
|
49
28
|
|
|
50
29
|
[:python_version < "3.12"]
|
|
51
30
|
pyarrow<15.0.0,>=13.0.0
|
|
@@ -1,169 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import math
|
|
4
|
-
from typing import Any
|
|
5
|
-
|
|
6
|
-
from palimpzest.constants import (
|
|
7
|
-
MODEL_CARDS,
|
|
8
|
-
NAIVE_EST_NUM_INPUT_TOKENS,
|
|
9
|
-
NAIVE_EST_NUM_OUTPUT_TOKENS,
|
|
10
|
-
)
|
|
11
|
-
from palimpzest.core.data.dataclasses import OperatorCostEstimates
|
|
12
|
-
from palimpzest.query.operators.convert import LLMConvertBonded
|
|
13
|
-
from palimpzest.utils.token_reduction_helpers import best_substring_match, find_best_range
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
# NOTE: this convert operation will not work with the new generation abstraction, and it needs to be worked on.
|
|
17
|
-
# There are two minor issues with the operator as it exists:
|
|
18
|
-
#
|
|
19
|
-
# 1) The token reduction operation operated over the entire JSON string of the input DataRecord
|
|
20
|
-
# - while this works in practice, it makes it difficult to use this operator with a generation framework
|
|
21
|
-
# where each field may be placed in a specific place in the format string for a prompt
|
|
22
|
-
# - we need to either (A) rewrite the reduction to take place on a field-by-field basis (or at least
|
|
23
|
-
# make it possible to recover each field after a global reduction) or (B) add custom logic within
|
|
24
|
-
# the Generator class(es) to handle this operator [I much prefer (A) over (B)]
|
|
25
|
-
#
|
|
26
|
-
# 2) The heatmap update logic does not translate well to the distributed setting, where this operator may
|
|
27
|
-
# be copied and executed many times in parallel
|
|
28
|
-
# - each copy of the operator will have its own heatmap and require MAX_HEATMAP_UPDATES just to enter the
|
|
29
|
-
# phase where token reduction takes place
|
|
30
|
-
# - this means that if we have 20-way parallelism and a MAX_HEATMAP_UPDATES = 5, it can take 100 inputs
|
|
31
|
-
# before token reduction ever takes place
|
|
32
|
-
# - this also creates difficulties in properly performing cost-estimation for this operator; e.g. if we use
|
|
33
|
-
# n <= MAX_HEATMAP_UPDATES samples to cost this operator, then we will never actually measure its performance
|
|
34
|
-
# in the token reduction phase -- which could have a serious degradation in quality that our optimizer doesn't see
|
|
35
|
-
class TokenReducedConvertBonded(LLMConvertBonded):
|
|
36
|
-
# NOTE: moving these closer to the TokenReducedConvertBonded class for now (in part to make
|
|
37
|
-
# them easier to mock); we can make these parameterized as well
|
|
38
|
-
MAX_HEATMAP_UPDATES: int = 5
|
|
39
|
-
TOKEN_REDUCTION_SAMPLE: int = 0
|
|
40
|
-
TOKEN_REDUCTION_GRANULARITY: float = 0.001
|
|
41
|
-
|
|
42
|
-
def __init__(self, token_budget: float, *args, **kwargs):
|
|
43
|
-
super().__init__(*args, **kwargs)
|
|
44
|
-
self.token_budget = token_budget
|
|
45
|
-
self.resolution = self.TOKEN_REDUCTION_GRANULARITY
|
|
46
|
-
self.first_execution = True
|
|
47
|
-
self.count = 0
|
|
48
|
-
self.heatmap = [0] * int(1.0 / self.resolution)
|
|
49
|
-
|
|
50
|
-
def __str__(self):
|
|
51
|
-
op = super().__str__()
|
|
52
|
-
op += f" Token Budget: {str(self.token_budget)}\n"
|
|
53
|
-
return op
|
|
54
|
-
|
|
55
|
-
def get_id_params(self):
|
|
56
|
-
id_params = super().get_id_params()
|
|
57
|
-
id_params = {"token_budget": self.token_budget, **id_params}
|
|
58
|
-
|
|
59
|
-
return id_params
|
|
60
|
-
|
|
61
|
-
def get_op_params(self):
|
|
62
|
-
op_params = super().get_op_params()
|
|
63
|
-
return {"token_budget": self.token_budget, **op_params}
|
|
64
|
-
|
|
65
|
-
def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
|
|
66
|
-
"""
|
|
67
|
-
Update the cost per record and quality estimates produced by LLMConvert's naive estimates.
|
|
68
|
-
We adjust the cost per record to account for the reduced number of input tokens following
|
|
69
|
-
token reduction, and we make a crude estimate of the quality degradation that results from
|
|
70
|
-
using fewer tokens.
|
|
71
|
-
"""
|
|
72
|
-
# get naive cost estimates from LLMConvert
|
|
73
|
-
naive_op_cost_estimates = super().naive_cost_estimates(source_op_cost_estimates)
|
|
74
|
-
|
|
75
|
-
# re-compute cost per record assuming we use fewer input tokens
|
|
76
|
-
est_num_input_tokens = NAIVE_EST_NUM_INPUT_TOKENS * self.token_budget
|
|
77
|
-
est_num_output_tokens = NAIVE_EST_NUM_OUTPUT_TOKENS
|
|
78
|
-
model_conversion_usd_per_record = (
|
|
79
|
-
MODEL_CARDS[self.model.value]["usd_per_input_token"] * est_num_input_tokens
|
|
80
|
-
+ MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
|
|
81
|
-
)
|
|
82
|
-
|
|
83
|
-
# set refined estimate of cost per record and, for now,
|
|
84
|
-
# assume quality multiplier is proportional to sqrt(sqrt(token_budget))
|
|
85
|
-
naive_op_cost_estimates.cost_per_record = model_conversion_usd_per_record
|
|
86
|
-
naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
|
|
87
|
-
naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record
|
|
88
|
-
naive_op_cost_estimates.quality = (naive_op_cost_estimates.quality) * math.sqrt(math.sqrt(self.token_budget))
|
|
89
|
-
naive_op_cost_estimates.quality_lower_bound = naive_op_cost_estimates.quality
|
|
90
|
-
naive_op_cost_estimates.quality_upper_bound = naive_op_cost_estimates.quality
|
|
91
|
-
|
|
92
|
-
return naive_op_cost_estimates
|
|
93
|
-
|
|
94
|
-
def is_image_conversion(self) -> bool:
|
|
95
|
-
"""TokenReducedConvertBonded is currently disallowed on image conversions, so this must be False."""
|
|
96
|
-
return False
|
|
97
|
-
|
|
98
|
-
def reduce_context(self, full_context: str) -> str:
|
|
99
|
-
range = find_best_range(
|
|
100
|
-
self.heatmap,
|
|
101
|
-
int(self.token_budget / self.TOKEN_REDUCTION_GRANULARITY),
|
|
102
|
-
trim_zeros=False,
|
|
103
|
-
)
|
|
104
|
-
if not range:
|
|
105
|
-
raise Exception("No range found in heatmap")
|
|
106
|
-
si, ei = range
|
|
107
|
-
print("si:", si, "ei:", ei)
|
|
108
|
-
sr, er = (
|
|
109
|
-
si * self.TOKEN_REDUCTION_GRANULARITY,
|
|
110
|
-
ei * self.TOKEN_REDUCTION_GRANULARITY,
|
|
111
|
-
)
|
|
112
|
-
test_len = len(full_context)
|
|
113
|
-
start = int(sr * test_len)
|
|
114
|
-
end = int(er * test_len)
|
|
115
|
-
if self.verbose:
|
|
116
|
-
print(f"start ratio: {sr} -- end ratio: {er}")
|
|
117
|
-
print("character start:", start, "end:", end)
|
|
118
|
-
sample = full_context[start:end]
|
|
119
|
-
return sample
|
|
120
|
-
|
|
121
|
-
def _dspy_generate_fields(self, prompt: str, content: str | list[str]) -> tuple[list[dict[str, list]] | Any]:
|
|
122
|
-
raise Exception(
|
|
123
|
-
"TokenReducedConvertBonded is executing despite being deprecated until implementation changes can be made."
|
|
124
|
-
)
|
|
125
|
-
answer, query_stats = None, None
|
|
126
|
-
if self.first_execution or self.count < self.MAX_HEATMAP_UPDATES:
|
|
127
|
-
if self.verbose:
|
|
128
|
-
print("Warming up heatmap")
|
|
129
|
-
answer, query_stats = super()._dspy_generate_fields(prompt, content)
|
|
130
|
-
self.first_execution = False
|
|
131
|
-
|
|
132
|
-
else:
|
|
133
|
-
if self.verbose:
|
|
134
|
-
print("Using heatmap")
|
|
135
|
-
|
|
136
|
-
# only refer to the heatmap if the count is greater than a enough sample size
|
|
137
|
-
# TODO: only trim the context if the attention is clustered in a small region
|
|
138
|
-
if self.count >= self.TOKEN_REDUCTION_SAMPLE:
|
|
139
|
-
context = self.reduce_context(content)
|
|
140
|
-
try:
|
|
141
|
-
answer, _, query_stats = self.generator.generate(context=context, prompt=prompt)
|
|
142
|
-
except Exception as e:
|
|
143
|
-
print(f"DSPy generation error: {e}, falling back to unreduced generation")
|
|
144
|
-
answer, query_stats = super()._dspy_generate_fields(prompt, content)
|
|
145
|
-
|
|
146
|
-
# TODO: answer and query stats may be unbound if we hit the else block
|
|
147
|
-
# and count < TOKEN_REDUCTION_SAMPLE, which makes the below pretty clunky
|
|
148
|
-
# this throw asserts our view of the world and we should refactor this
|
|
149
|
-
if answer is None or query_stats is None:
|
|
150
|
-
raise Exception("answer or query_stats is None")
|
|
151
|
-
try:
|
|
152
|
-
match = best_substring_match(answer, content)
|
|
153
|
-
if not match:
|
|
154
|
-
gsi, gei = 0, len(content)
|
|
155
|
-
else:
|
|
156
|
-
gsi, gei = match
|
|
157
|
-
except Exception as e:
|
|
158
|
-
print("Error in substring match:", e)
|
|
159
|
-
gsi, gei = 0, len(content)
|
|
160
|
-
context_len = len(content)
|
|
161
|
-
gsr, ger = gsi / context_len, gei / context_len
|
|
162
|
-
norm_si, norm_ei = int(gsr / self.resolution), int(ger / self.resolution)
|
|
163
|
-
if self.verbose:
|
|
164
|
-
print(f"best_start: {gsi} -- best_end: {gei}")
|
|
165
|
-
|
|
166
|
-
self.count += 1
|
|
167
|
-
self.heatmap[norm_si:norm_ei] = map(lambda x: x + 1, self.heatmap[norm_si:norm_ei])
|
|
168
|
-
|
|
169
|
-
return answer, query_stats
|
|
@@ -1,105 +0,0 @@
|
|
|
1
|
-
from fuzzywuzzy import fuzz, process
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
def find_best_range(values, budget, trim_zeros=False):
|
|
5
|
-
"""
|
|
6
|
-
Finds the consecutive range with the biggest sum within a budget.
|
|
7
|
-
|
|
8
|
-
Args:
|
|
9
|
-
values: A list of non-negative numbers.
|
|
10
|
-
budget: The maximum number of consecutive elements to consider.
|
|
11
|
-
|
|
12
|
-
Returns:
|
|
13
|
-
A tuple containing the start and end indices (inclusive) of the best range,
|
|
14
|
-
or None if the array is empty.
|
|
15
|
-
"""
|
|
16
|
-
if not values:
|
|
17
|
-
return None
|
|
18
|
-
|
|
19
|
-
n = len(values)
|
|
20
|
-
best_sum, best_start, current_sum, current_start = 0, 0, 0, 0
|
|
21
|
-
|
|
22
|
-
# Iterate through the array, keeping track of current and best ranges.
|
|
23
|
-
for i in range(n):
|
|
24
|
-
current_sum += values[i]
|
|
25
|
-
|
|
26
|
-
# If the current range exceeds the budget, remove elements from the beginning.
|
|
27
|
-
while current_start + budget - 1 < i and current_start + budget - 1 >= 0:
|
|
28
|
-
current_sum -= values[current_start]
|
|
29
|
-
current_start += 1
|
|
30
|
-
|
|
31
|
-
# Update best range if the current sum is bigger.
|
|
32
|
-
if current_sum > best_sum:
|
|
33
|
-
best_sum = current_sum
|
|
34
|
-
best_start = current_start
|
|
35
|
-
|
|
36
|
-
best_end = best_start + budget - 1
|
|
37
|
-
print("best_start:", best_start, "best_end:", best_end)
|
|
38
|
-
if trim_zeros:
|
|
39
|
-
# Trim leading/trailing zeros
|
|
40
|
-
while best_start >= 0 and values[best_start] == 0:
|
|
41
|
-
best_start += 1
|
|
42
|
-
|
|
43
|
-
while best_end < n and values[best_end] == 0:
|
|
44
|
-
best_end -= 1
|
|
45
|
-
else:
|
|
46
|
-
# balance the zero entries equally on both sides
|
|
47
|
-
leading_zeros = 0
|
|
48
|
-
trailing_zeros = 0
|
|
49
|
-
start_idx = best_start
|
|
50
|
-
end_idx = best_end
|
|
51
|
-
while start_idx >= 0 and values[start_idx] == 0:
|
|
52
|
-
leading_zeros += 1
|
|
53
|
-
start_idx += 1
|
|
54
|
-
while end_idx < n and values[end_idx] == 0:
|
|
55
|
-
trailing_zeros += 1
|
|
56
|
-
end_idx -= 1
|
|
57
|
-
half_zeros = int((leading_zeros + trailing_zeros) / 2)
|
|
58
|
-
print("leading_zeros:", leading_zeros, "trailing_zeros:", trailing_zeros, "half_zeros:", half_zeros)
|
|
59
|
-
best_start = best_start - half_zeros + leading_zeros
|
|
60
|
-
best_end = best_end - trailing_zeros + leading_zeros + trailing_zeros - half_zeros
|
|
61
|
-
|
|
62
|
-
if best_start < 0:
|
|
63
|
-
best_end = best_end - best_start
|
|
64
|
-
best_start = 0
|
|
65
|
-
if best_end >= n:
|
|
66
|
-
best_start = best_start - (best_end - n + 1)
|
|
67
|
-
best_end = n - 1
|
|
68
|
-
|
|
69
|
-
return best_start, best_end + 1
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def get_range_from_hist(file_path, range_budget, resolution=0.001, trim_zeros=True):
|
|
73
|
-
# Load data from csv file and extract he second column as values
|
|
74
|
-
values = []
|
|
75
|
-
with open(file_path) as file:
|
|
76
|
-
for line in file:
|
|
77
|
-
line = line.strip()
|
|
78
|
-
values.append(int(float(line.split(",")[1])))
|
|
79
|
-
index_range = 1 / resolution
|
|
80
|
-
budget = int(range_budget * index_range)
|
|
81
|
-
# Find the best range
|
|
82
|
-
range = find_best_range(values, budget, trim_zeros=trim_zeros)
|
|
83
|
-
if not range:
|
|
84
|
-
raise ValueError("No range found")
|
|
85
|
-
start, end = range
|
|
86
|
-
print("start:", start, "end:", end, "index_range:", index_range)
|
|
87
|
-
return start * 1.0 / index_range, end * 1.0 / index_range
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
def best_substring_match(query: str, context: str | list[str]):
|
|
91
|
-
# This will extract all substrings of length equal to the query from the string
|
|
92
|
-
candidates = [context[i : i + len(query)] for i in range(len(context) - len(query) + 1)]
|
|
93
|
-
|
|
94
|
-
# Find the best match among the candidates
|
|
95
|
-
ret = process.extractOne(query, candidates, scorer=fuzz.ratio)
|
|
96
|
-
if ret is None:
|
|
97
|
-
return None
|
|
98
|
-
|
|
99
|
-
best_match, score = ret
|
|
100
|
-
positions = [can == best_match for can in candidates]
|
|
101
|
-
start = positions.index(True)
|
|
102
|
-
end = start + len(query)
|
|
103
|
-
# print("best match:", best_match, "score:", score, "start:", start, "end:", end)
|
|
104
|
-
# print("-------", string[start:end])
|
|
105
|
-
return start, end
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/critique_and_refine_convert_prompts.py
RENAMED
|
File without changes
|
|
File without changes
|
{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/moa_aggregator_convert_prompts.py
RENAMED
|
File without changes
|
{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/moa_proposer_convert_prompts.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/execution_strategy_type.py
RENAMED
|
File without changes
|
{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/mab_execution_strategy.py
RENAMED
|
File without changes
|
{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/parallel_execution_strategy.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/code_synthesis_convert.py
RENAMED
|
File without changes
|
|
File without changes
|
{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/critique_and_refine_convert.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/mixture_of_agents_convert.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/optimizer_strategy_type.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/nosentinel_processor.py
RENAMED
|
File without changes
|
{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/processing_strategy_type.py
RENAMED
|
File without changes
|
|
File without changes
|
{palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/query_processor_factory.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|