palimpzest 0.7.1__tar.gz → 0.7.3__tar.gz

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (101)
  1. {palimpzest-0.7.1/src/palimpzest.egg-info → palimpzest-0.7.3}/PKG-INFO +4 -25
  2. {palimpzest-0.7.1 → palimpzest-0.7.3}/pyproject.toml +4 -25
  3. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/data/datareaders.py +1 -18
  4. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/rag_convert.py +1 -2
  5. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/split_convert.py +1 -2
  6. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/__init__.py +0 -4
  7. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/cost_model.py +0 -12
  8. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/optimizer.py +1 -11
  9. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/rules.py +0 -76
  10. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/config.py +0 -2
  11. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/demo_helpers.py +0 -2
  12. {palimpzest-0.7.1 → palimpzest-0.7.3/src/palimpzest.egg-info}/PKG-INFO +4 -25
  13. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest.egg-info/SOURCES.txt +0 -2
  14. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest.egg-info/requires.txt +3 -24
  15. palimpzest-0.7.1/src/palimpzest/query/operators/token_reduction_convert.py +0 -169
  16. palimpzest-0.7.1/src/palimpzest/utils/token_reduction_helpers.py +0 -105
  17. {palimpzest-0.7.1 → palimpzest-0.7.3}/LICENSE +0 -0
  18. {palimpzest-0.7.1 → palimpzest-0.7.3}/README.md +0 -0
  19. {palimpzest-0.7.1 → palimpzest-0.7.3}/setup.cfg +0 -0
  20. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/__init__.py +0 -0
  21. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/constants.py +0 -0
  22. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/__init__.py +0 -0
  23. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/data/__init__.py +0 -0
  24. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/data/dataclasses.py +0 -0
  25. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/elements/__init__.py +0 -0
  26. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/elements/filters.py +0 -0
  27. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/elements/groupbysig.py +0 -0
  28. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/elements/index.py +0 -0
  29. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/elements/records.py +0 -0
  30. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/lib/__init__.py +0 -0
  31. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/lib/fields.py +0 -0
  32. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/core/lib/schemas.py +0 -0
  33. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/policy.py +0 -0
  34. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/__init__.py +0 -0
  35. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/code_synthesis_prompts.py +0 -0
  36. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/convert_prompts.py +0 -0
  37. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/critique_and_refine_convert_prompts.py +0 -0
  38. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/filter_prompts.py +0 -0
  39. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/moa_aggregator_convert_prompts.py +0 -0
  40. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/moa_proposer_convert_prompts.py +0 -0
  41. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/prompt_factory.py +0 -0
  42. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/split_merge_prompts.py +0 -0
  43. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/split_proposer_prompts.py +0 -0
  44. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/prompts/util_phrases.py +0 -0
  45. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/__init__.py +0 -0
  46. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/__init__.py +0 -0
  47. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/execution_strategy.py +0 -0
  48. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/execution_strategy_type.py +0 -0
  49. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/mab_execution_strategy.py +0 -0
  50. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/parallel_execution_strategy.py +0 -0
  51. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/random_sampling_execution_strategy.py +0 -0
  52. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/execution/single_threaded_execution_strategy.py +0 -0
  53. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/generators/__init__.py +0 -0
  54. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/generators/api_client_factory.py +0 -0
  55. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/generators/generators.py +0 -0
  56. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/__init__.py +0 -0
  57. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/aggregate.py +0 -0
  58. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/code_synthesis_convert.py +0 -0
  59. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/convert.py +0 -0
  60. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/critique_and_refine_convert.py +0 -0
  61. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/filter.py +0 -0
  62. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/limit.py +0 -0
  63. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/logical.py +0 -0
  64. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/map.py +0 -0
  65. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/mixture_of_agents_convert.py +0 -0
  66. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/physical.py +0 -0
  67. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/project.py +0 -0
  68. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/retrieve.py +0 -0
  69. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/operators/scan.py +0 -0
  70. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/optimizer_strategy.py +0 -0
  71. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/optimizer_strategy_type.py +0 -0
  72. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/plan.py +0 -0
  73. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/primitives.py +0 -0
  74. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/optimizer/tasks.py +0 -0
  75. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/__init__.py +0 -0
  76. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/nosentinel_processor.py +0 -0
  77. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/processing_strategy_type.py +0 -0
  78. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/query_processor.py +0 -0
  79. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/query_processor_factory.py +0 -0
  80. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/sentinel_processor.py +0 -0
  81. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/query/processor/streaming_processor.py +0 -0
  82. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/schemabuilder/__init__.py +0 -0
  83. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/schemabuilder/schema_builder.py +0 -0
  84. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/sets.py +0 -0
  85. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/tools/README.md +0 -0
  86. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/tools/__init__.py +0 -0
  87. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/tools/allenpdf.py +0 -0
  88. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/tools/pdfparser.py +0 -0
  89. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/tools/skema_tools.py +0 -0
  90. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/__init__.py +0 -0
  91. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/datareader_helpers.py +0 -0
  92. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/env_helpers.py +0 -0
  93. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/field_helpers.py +0 -0
  94. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/generation_helpers.py +0 -0
  95. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/hash_helpers.py +0 -0
  96. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/model_helpers.py +0 -0
  97. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/progress.py +0 -0
  98. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/sandbox.py +0 -0
  99. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest/utils/udfs.py +0 -0
  100. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest.egg-info/dependency_links.txt +0 -0
  101. {palimpzest-0.7.1 → palimpzest-0.7.3}/src/palimpzest.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: palimpzest
- Version: 0.7.1
+ Version: 0.7.3
  Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
  Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
  Project-URL: homepage, https://palimpzest.org
@@ -15,45 +15,25 @@ Classifier: Programming Language :: Python :: 3.8
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: charset-normalizer>=3.3.2
  Requires-Dist: chromadb>=0.6.3
- Requires-Dist: click>=8.1.7
- Requires-Dist: click-aliases>=1.0.4
- Requires-Dist: colorama>=0.4.6
  Requires-Dist: fastapi~=0.115.0
- Requires-Dist: fuzzywuzzy>=0.18.0
- Requires-Dist: google-generativeai>=0.8.0
- Requires-Dist: gradio>=4.20.1
- Requires-Dist: grobid-client-python==0.0.5
- Requires-Dist: ipython>=8.26.0
- Requires-Dist: opencv-python-headless<4.9.0,>=4.8.0
- Requires-Dist: jupyter
- Requires-Dist: layoutparser>=0.3.4
- Requires-Dist: lxml-html-clean>=0.1.1
+ Requires-Dist: gradio>=5.26.0
  Requires-Dist: mkdocs>=1.6.1
  Requires-Dist: mkdocs-material>=9.6.3
  Requires-Dist: mkdocs-material[imaging]
  Requires-Dist: mkdocstrings-python>=1.15.0
- Requires-Dist: modal>=0.62.198
- Requires-Dist: ncls==0.0.68
- Requires-Dist: necessary>=0.3.2
  Requires-Dist: numpy>=1.23.2
  Requires-Dist: openai>=1.0
- Requires-Dist: openpyxl==3.1.2
  Requires-Dist: pandas>=2.1.1
- Requires-Dist: papermage>=0.16.0
- Requires-Dist: pdf2image
  Requires-Dist: pytest>=8.2.2
- Requires-Dist: python-Levenshtein
- Requires-Dist: pdfplumber==0.7.4
- Requires-Dist: pillow>=10.2.0
+ Requires-Dist: pillow
  Requires-Dist: prettytable>=3.9.0
+ Requires-Dist: psutil>=7.0.0
  Requires-Dist: PyLD>=2.0.4
  Requires-Dist: pyarrow<15.0.0,>=13.0.0; python_version < "3.12"
  Requires-Dist: pyarrow<19.0.0,>=15.0.0; python_version >= "3.12"
  Requires-Dist: pypdf>=5.1.0
  Requires-Dist: pytest-mock>=3.14.0
- Requires-Dist: python-Levenshtein>=0.25.1
  Requires-Dist: pyyaml>=6.0.1
  Requires-Dist: ragatouille>=0.0.9
  Requires-Dist: requests>=2.25
@@ -64,7 +44,6 @@ Requires-Dist: together>=1.3.1
  Requires-Dist: tqdm~=4.66.1
  Requires-Dist: transformers<4.50.0,>=4.41.3
  Requires-Dist: rich[jupyter]>=13.9.2
- Requires-Dist: voyager>=2.0.9
  Dynamic: license-file

  ![pz-banner](https://palimpzest-workloads.s3.us-east-1.amazonaws.com/palimpzest-cropped.png)
@@ -1,6 +1,6 @@
  [project]
  name = "palimpzest"
- version = "0.7.1"
+ version = "0.7.3"
  description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
  readme = "README.md"
  requires-python = ">=3.8"
@@ -9,45 +9,25 @@ authors = [
  {name="MIT DSG Semantic Management Lab", email="michjc@csail.mit.edu"},
  ]
  dependencies = [
- "charset-normalizer>=3.3.2",
  "chromadb>=0.6.3",
- "click>=8.1.7",
- "click-aliases>=1.0.4",
- "colorama>=0.4.6",
  "fastapi~=0.115.0",
- "fuzzywuzzy>=0.18.0",
- "google-generativeai>=0.8.0",
- "gradio>=4.20.1",
- "grobid-client-python==0.0.5",
- "ipython>=8.26.0",
- "opencv-python-headless>=4.8.0,<4.9.0",
- "jupyter",
- "layoutparser>=0.3.4",
- "lxml-html-clean>=0.1.1",
+ "gradio>=5.26.0",
  "mkdocs>=1.6.1",
  "mkdocs-material>=9.6.3",
  "mkdocs-material[imaging]",
  "mkdocstrings-python>=1.15.0",
- "modal>=0.62.198",
- "ncls==0.0.68",
- "necessary>=0.3.2",
  "numpy>=1.23.2",
  "openai>=1.0",
- "openpyxl==3.1.2",
  "pandas>=2.1.1",
- "papermage>=0.16.0",
- "pdf2image",
  "pytest>=8.2.2",
- "python-Levenshtein",
- "pdfplumber==0.7.4",
- "pillow>=10.2.0",
+ "pillow",
  "prettytable>=3.9.0",
+ "psutil>=7.0.0",
  "PyLD>=2.0.4",
  "pyarrow>=13.0.0,<15.0.0; python_version<'3.12'",
  "pyarrow>=15.0.0,<19.0.0; python_version>='3.12'",
  "pypdf>=5.1.0",
  "pytest-mock>=3.14.0",
- "python-Levenshtein>=0.25.1",
  "pyyaml>=6.0.1",
  "ragatouille>=0.0.9",
  "requests>=2.25",
@@ -58,7 +38,6 @@ dependencies = [
  "tqdm~=4.66.1",
  "transformers>=4.41.3,<4.50.0",
  "rich[jupyter]>=13.9.2",
- "voyager>=2.0.9",
  ]
  classifiers=[
  "Development Status :: 4 - Beta", # Change as appropriate
@@ -1,15 +1,12 @@
  from __future__ import annotations

  import base64
- import json
  import os
  from abc import ABC, abstractmethod
  from io import BytesIO

- import modal
  import pandas as pd
  from bs4 import BeautifulSoup
- from papermage import Document

  from palimpzest import constants
  from palimpzest.core.lib.schemas import (
@@ -390,22 +387,8 @@ class PDFFileDirectoryReader(DirectoryReader):
  with open(filepath, "rb") as f:
  pdf_bytes = f.read()

- if self.pdfprocessor == "modal":
- print("handling PDF processing remotely")
- remote_func = modal.Function.lookup("palimpzest.tools", "processPapermagePdf")
- else:
- remote_func = None
-
  # generate text_content from PDF
- if remote_func is not None:
- doc_json_str = remote_func.remote([pdf_bytes])
- docdict = json.loads(doc_json_str[0])
- doc = Document.from_json(docdict)
- text_content = ""
- for p in doc.pages:
- text_content += p.text
- else:
- text_content = get_text_from_pdf(pdf_filename, pdf_bytes, pdfprocessor=self.pdfprocessor, file_cache_dir=self.file_cache_dir)
+ text_content = get_text_from_pdf(pdf_filename, pdf_bytes, pdfprocessor=self.pdfprocessor, file_cache_dir=self.file_cache_dir)

  # construct and return item
  return {"filename": pdf_filename, "contents": pdf_bytes, "text_contents": text_content}
@@ -64,8 +64,7 @@ class RAGConvert(LLMConvert):
  + MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
  )

- # set refined estimate of cost per record and, for now,
- # assume quality multiplier is proportional to sqrt(sqrt(token_budget))
+ # set refined estimate of cost per record
  naive_op_cost_estimates.cost_per_record = model_conversion_usd_per_record
  naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
  naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record
@@ -61,8 +61,7 @@ class SplitConvert(LLMConvert):
  + MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
  )

- # set refined estimate of cost per record and, for now,
- # assume quality multiplier is proportional to sqrt(sqrt(token_budget))
+ # set refined estimate of cost per record
  naive_op_cost_estimates.cost_per_record = model_conversion_usd_per_record
  naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
  naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record
@@ -46,9 +46,6 @@ from palimpzest.query.optimizer.rules import (
  from palimpzest.query.optimizer.rules import (
  SplitConvertRule as _SplitConvertRule,
  )
- from palimpzest.query.optimizer.rules import (
- TokenReducedConvertBondedRule as _TokenReducedConvertBondedRule,
- )
  from palimpzest.query.optimizer.rules import (
  TransformationRule as _TransformationRule,
  )
@@ -70,7 +67,6 @@ ALL_RULES = [
  _RetrieveRule,
  _Rule,
  _SplitConvertRule,
- _TokenReducedConvertBondedRule,
  _TransformationRule,
  ]

@@ -1,7 +1,6 @@
  from __future__ import annotations

  import logging
- import math

  # NOTE: the answer.mode() call(s) inside of _est_quality() throw a UserWarning when there are multiple
  # answers to a convert with the same mode. This is because pandas tries to sort the answers
@@ -24,7 +23,6 @@ from palimpzest.query.operators.limit import LimitScanOp
  from palimpzest.query.operators.physical import PhysicalOperator
  from palimpzest.query.operators.rag_convert import RAGConvert
  from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp, ScanPhysicalOp
- from palimpzest.query.operators.token_reduction_convert import TokenReducedConvertBonded
  from palimpzest.utils.model_helpers import get_champion_model_name, get_models

  warnings.simplefilter(action='ignore', category=UserWarning)
@@ -574,16 +572,6 @@ class CostModel(BaseCostModel):
  op_estimates.cost_per_record = 1e-4
  op_estimates.quality = op_estimates.quality * (GPT_4o_MODEL_CARD["code"] / 100.0)

- # token reduction adjustment
- if isinstance(operator, TokenReducedConvertBonded):
- total_input_tokens = operator.token_budget * sample_op_estimates[op_id][model_name]["total_input_tokens"]
- total_output_tokens = sample_op_estimates[op_id][model_name]["total_output_tokens"]
- op_estimates.cost_per_record = (
- MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens
- + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens
- )
- op_estimates.quality = op_estimates.quality * math.sqrt(math.sqrt(operator.token_budget))
-
  # rag convert adjustment
  if isinstance(operator, RAGConvert):
  total_input_tokens = operator.num_chunks_per_field * operator.chunk_size
@@ -34,7 +34,6 @@ from palimpzest.query.optimizer.rules import (
  MixtureOfAgentsConvertRule,
  RAGConvertRule,
  SplitConvertRule,
- TokenReducedConvertBondedRule,
  )
  from palimpzest.query.optimizer.tasks import (
  ApplyRule,
@@ -90,7 +89,6 @@ class Optimizer:
  verbose: bool = False,
  allow_bonded_query: bool = True,
  allow_code_synth: bool = False,
- allow_token_reduction: bool = False,
  allow_rag_reduction: bool = False,
  allow_mixtures: bool = True,
  allow_critic: bool = False,
@@ -134,7 +132,6 @@ class Optimizer:
  if optimizer_strategy == OptimizationStrategyType.NONE:
  self.allow_bonded_query = True
  self.allow_code_synth = False
- self.allow_token_reduction = False
  self.allow_rag_reduction = False
  self.allow_mixtures = False
  self.allow_critic = False
@@ -147,7 +144,6 @@ class Optimizer:
  self.available_models = available_models
  self.allow_bonded_query = allow_bonded_query
  self.allow_code_synth = allow_code_synth
- self.allow_token_reduction = allow_token_reduction
  self.allow_rag_reduction = allow_rag_reduction
  self.allow_mixtures = allow_mixtures
  self.allow_critic = allow_critic
@@ -160,7 +156,7 @@ class Optimizer:
  self.implementation_rules = [
  rule
  for rule in self.implementation_rules
- if rule not in [LLMConvertBondedRule, TokenReducedConvertBondedRule]
+ if rule not in [LLMConvertBondedRule]
  ]

  if not self.allow_code_synth:
@@ -168,11 +164,6 @@ class Optimizer:
  rule for rule in self.implementation_rules if not issubclass(rule, CodeSynthesisConvertRule)
  ]

- if not self.allow_token_reduction:
- self.implementation_rules = [
- rule for rule in self.implementation_rules if not issubclass(rule, TokenReducedConvertBondedRule)
- ]
-
  if not self.allow_rag_reduction:
  self.implementation_rules = [
  rule for rule in self.implementation_rules if not issubclass(rule, RAGConvertRule)
@@ -218,7 +209,6 @@ class Optimizer:
  available_models=self.available_models,
  allow_bonded_query=self.allow_bonded_query,
  allow_code_synth=self.allow_code_synth,
- allow_token_reduction=self.allow_token_reduction,
  allow_rag_reduction=self.allow_rag_reduction,
  allow_mixtures=self.allow_mixtures,
  allow_critic=self.allow_critic,
@@ -28,7 +28,6 @@ from palimpzest.query.operators.rag_convert import RAGConvert
  from palimpzest.query.operators.retrieve import RetrieveOp
  from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp
  from palimpzest.query.operators.split_convert import SplitConvert
- from palimpzest.query.operators.token_reduction_convert import TokenReducedConvertBonded
  from palimpzest.query.optimizer.primitives import Expression, Group, LogicalExpression, PhysicalExpression
  from palimpzest.utils.model_helpers import get_models, get_vision_models

@@ -352,81 +351,6 @@ class LLMConvertBondedRule(ImplementationRule):
  return deduped_physical_expressions


- class TokenReducedConvertBondedRule(ImplementationRule):
- """
- Substitute a logical expression for a ConvertScan with a bonded token reduced physical implementation.
- """
-
- token_budgets = [0.1, 0.5, 0.9]
-
- @classmethod
- def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
- logical_op = logical_expression.operator
- is_image_conversion = any(
- [
- field.is_image_field
- for field_name, field in logical_expression.input_fields.items()
- if field_name.split(".")[-1] in logical_expression.depends_on_field_names
- ]
- )
- is_match = isinstance(logical_op, ConvertScan) and not is_image_conversion and logical_op.udf is None
- logger.debug(f"TokenReducedConvertBondedRule matches_pattern: {is_match} for {logical_expression}")
- return is_match
-
- @classmethod
- def substitute(cls, logical_expression: LogicalExpression, **physical_op_params) -> set[PhysicalExpression]:
- logger.debug(f"Substituting TokenReducedConvertBondedRule for {logical_expression}")
-
- logical_op = logical_expression.operator
-
- # get initial set of parameters for physical op
- op_kwargs = logical_op.get_logical_op_params()
- op_kwargs.update(
- {
- "verbose": physical_op_params["verbose"],
- "logical_op_id": logical_op.get_logical_op_id(),
- "logical_op_name": logical_op.logical_op_name(),
- }
- )
-
- # NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
- # thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
- #
- # identify models which can be used strictly for text or strictly for images
- vision_models = set(get_vision_models())
- text_models = set(get_models())
- pure_vision_models = {model for model in vision_models if model not in text_models}
-
- physical_expressions = []
- for model in physical_op_params["available_models"]:
- for token_budget in cls.token_budgets:
- # skip this model if this is a pure image model
- if model in pure_vision_models:
- continue
-
- # construct multi-expression
- op = TokenReducedConvertBonded(
- model=model,
- prompt_strategy=PromptStrategy.COT_QA,
- token_budget=token_budget,
- **op_kwargs,
- )
- expression = PhysicalExpression(
- operator=op,
- input_group_ids=logical_expression.input_group_ids,
- input_fields=logical_expression.input_fields,
- depends_on_field_names=logical_expression.depends_on_field_names,
- generated_fields=logical_expression.generated_fields,
- group_id=logical_expression.group_id,
- )
- physical_expressions.append(expression)
-
- logger.debug(f"Done substituting TokenReducedConvertBondedRule for {logical_expression}")
- deduped_physical_expressions = set(physical_expressions)
-
- return deduped_physical_expressions
-
-
  class CodeSynthesisConvertRule(ImplementationRule):
  """
  Base rule for code synthesis convert operators; the physical convert class
@@ -31,7 +31,6 @@ class QueryProcessorConfig:
  allow_bonded_query: bool = field(default=True)
  allow_model_selection: bool = field(default=True)
  allow_code_synth: bool = field(default=False)
- allow_token_reduction: bool = field(default=False)
  allow_rag_reduction: bool = field(default=True)
  allow_mixtures: bool = field(default=True)
  allow_critic: bool = field(default=True)
@@ -59,7 +58,6 @@ class QueryProcessorConfig:
  "allow_bonded_query": self.allow_bonded_query,
  "allow_model_selection": self.allow_model_selection,
  "allow_code_synth": self.allow_code_synth,
- "allow_token_reduction": self.allow_token_reduction,
  "allow_rag_reduction": self.allow_rag_reduction,
  "allow_mixtures": self.allow_mixtures,
  "allow_critic": self.allow_critic,
@@ -47,8 +47,6 @@ def create_plan_str(flatten_ops):
  else str(right.filter.filter_fn)
  )
  plan_str += f'\n Filter: "{filter_str}"'
- if hasattr(right, "token_budget"):
- plan_str += f"\n Token budget: {right.token_budget}"
  plan_str += "\n"
  plan_str += (
  f" ({','.join(in_schema.field_names())[:15]}...) -> ({','.join(out_schema.field_names())[:15]}...)"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: palimpzest
- Version: 0.7.1
+ Version: 0.7.3
  Summary: Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language
  Author-email: MIT DSG Semantic Management Lab <michjc@csail.mit.edu>
  Project-URL: homepage, https://palimpzest.org
@@ -15,45 +15,25 @@ Classifier: Programming Language :: Python :: 3.8
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: charset-normalizer>=3.3.2
  Requires-Dist: chromadb>=0.6.3
- Requires-Dist: click>=8.1.7
- Requires-Dist: click-aliases>=1.0.4
- Requires-Dist: colorama>=0.4.6
  Requires-Dist: fastapi~=0.115.0
- Requires-Dist: fuzzywuzzy>=0.18.0
- Requires-Dist: google-generativeai>=0.8.0
- Requires-Dist: gradio>=4.20.1
- Requires-Dist: grobid-client-python==0.0.5
- Requires-Dist: ipython>=8.26.0
- Requires-Dist: opencv-python-headless<4.9.0,>=4.8.0
- Requires-Dist: jupyter
- Requires-Dist: layoutparser>=0.3.4
- Requires-Dist: lxml-html-clean>=0.1.1
+ Requires-Dist: gradio>=5.26.0
  Requires-Dist: mkdocs>=1.6.1
  Requires-Dist: mkdocs-material>=9.6.3
  Requires-Dist: mkdocs-material[imaging]
  Requires-Dist: mkdocstrings-python>=1.15.0
- Requires-Dist: modal>=0.62.198
- Requires-Dist: ncls==0.0.68
- Requires-Dist: necessary>=0.3.2
  Requires-Dist: numpy>=1.23.2
  Requires-Dist: openai>=1.0
- Requires-Dist: openpyxl==3.1.2
  Requires-Dist: pandas>=2.1.1
- Requires-Dist: papermage>=0.16.0
- Requires-Dist: pdf2image
  Requires-Dist: pytest>=8.2.2
- Requires-Dist: python-Levenshtein
- Requires-Dist: pdfplumber==0.7.4
- Requires-Dist: pillow>=10.2.0
+ Requires-Dist: pillow
  Requires-Dist: prettytable>=3.9.0
+ Requires-Dist: psutil>=7.0.0
  Requires-Dist: PyLD>=2.0.4
  Requires-Dist: pyarrow<15.0.0,>=13.0.0; python_version < "3.12"
  Requires-Dist: pyarrow<19.0.0,>=15.0.0; python_version >= "3.12"
  Requires-Dist: pypdf>=5.1.0
  Requires-Dist: pytest-mock>=3.14.0
- Requires-Dist: python-Levenshtein>=0.25.1
  Requires-Dist: pyyaml>=6.0.1
  Requires-Dist: ragatouille>=0.0.9
  Requires-Dist: requests>=2.25
@@ -64,7 +44,6 @@ Requires-Dist: together>=1.3.1
  Requires-Dist: tqdm~=4.66.1
  Requires-Dist: transformers<4.50.0,>=4.41.3
  Requires-Dist: rich[jupyter]>=13.9.2
- Requires-Dist: voyager>=2.0.9
  Dynamic: license-file

  ![pz-banner](https://palimpzest-workloads.s3.us-east-1.amazonaws.com/palimpzest-cropped.png)
@@ -60,7 +60,6 @@ src/palimpzest/query/operators/rag_convert.py
  src/palimpzest/query/operators/retrieve.py
  src/palimpzest/query/operators/scan.py
  src/palimpzest/query/operators/split_convert.py
- src/palimpzest/query/operators/token_reduction_convert.py
  src/palimpzest/query/optimizer/__init__.py
  src/palimpzest/query/optimizer/cost_model.py
  src/palimpzest/query/optimizer/optimizer.py
@@ -95,5 +94,4 @@ src/palimpzest/utils/hash_helpers.py
  src/palimpzest/utils/model_helpers.py
  src/palimpzest/utils/progress.py
  src/palimpzest/utils/sandbox.py
- src/palimpzest/utils/token_reduction_helpers.py
  src/palimpzest/utils/udfs.py
@@ -1,40 +1,20 @@
- charset-normalizer>=3.3.2
  chromadb>=0.6.3
- click>=8.1.7
- click-aliases>=1.0.4
- colorama>=0.4.6
  fastapi~=0.115.0
- fuzzywuzzy>=0.18.0
- google-generativeai>=0.8.0
- gradio>=4.20.1
- grobid-client-python==0.0.5
- ipython>=8.26.0
- opencv-python-headless<4.9.0,>=4.8.0
- jupyter
- layoutparser>=0.3.4
- lxml-html-clean>=0.1.1
+ gradio>=5.26.0
  mkdocs>=1.6.1
  mkdocs-material>=9.6.3
  mkdocs-material[imaging]
  mkdocstrings-python>=1.15.0
- modal>=0.62.198
- ncls==0.0.68
- necessary>=0.3.2
  numpy>=1.23.2
  openai>=1.0
- openpyxl==3.1.2
  pandas>=2.1.1
- papermage>=0.16.0
- pdf2image
  pytest>=8.2.2
- python-Levenshtein
- pdfplumber==0.7.4
- pillow>=10.2.0
+ pillow
  prettytable>=3.9.0
+ psutil>=7.0.0
  PyLD>=2.0.4
  pypdf>=5.1.0
  pytest-mock>=3.14.0
- python-Levenshtein>=0.25.1
  pyyaml>=6.0.1
  ragatouille>=0.0.9
  requests>=2.25
@@ -45,7 +25,6 @@ together>=1.3.1
  tqdm~=4.66.1
  transformers<4.50.0,>=4.41.3
  rich[jupyter]>=13.9.2
- voyager>=2.0.9

  [:python_version < "3.12"]
  pyarrow<15.0.0,>=13.0.0
@@ -1,169 +0,0 @@
- from __future__ import annotations
-
- import math
- from typing import Any
-
- from palimpzest.constants import (
- MODEL_CARDS,
- NAIVE_EST_NUM_INPUT_TOKENS,
- NAIVE_EST_NUM_OUTPUT_TOKENS,
- )
- from palimpzest.core.data.dataclasses import OperatorCostEstimates
- from palimpzest.query.operators.convert import LLMConvertBonded
- from palimpzest.utils.token_reduction_helpers import best_substring_match, find_best_range
-
-
- # NOTE: this convert operation will not work with the new generation abstraction, and it needs to be worked on.
- # There are two minor issues with the operator as it exists:
- #
- # 1) The token reduction operation operated over the entire JSON string of the input DataRecord
- # - while this works in practice, it makes it difficult to use this operator with a generation framework
- # where each field may be placed in a specific place in the format string for a prompt
- # - we need to either (A) rewrite the reduction to take place on a field-by-field basis (or at least
- # make it possible to recover each field after a global reduction) or (B) add custom logic within
- # the Generator class(es) to handle this operator [I much prefer (A) over (B)]
- #
- # 2) The heatmap update logic does not translate well to the distributed setting, where this operator may
- # be copied and executed many times in parallel
- # - each copy of the operator will have its own heatmap and require MAX_HEATMAP_UPDATES just to enter the
- # phase where token reduction takes place
- # - this means that if we have 20-way parallelism and a MAX_HEATMAP_UPDATES = 5, it can take 100 inputs
- # before token reduction ever takes place
- # - this also creates difficulties in properly performing cost-estimation for this operator; e.g. if we use
- # n <= MAX_HEATMAP_UPDATES samples to cost this operator, then we will never actually measure its performance
- # in the token reduction phase -- which could have a serious degradation in quality that our optimizer doesn't see
- class TokenReducedConvertBonded(LLMConvertBonded):
- # NOTE: moving these closer to the TokenReducedConvertBonded class for now (in part to make
- # them easier to mock); we can make these parameterized as well
- MAX_HEATMAP_UPDATES: int = 5
- TOKEN_REDUCTION_SAMPLE: int = 0
- TOKEN_REDUCTION_GRANULARITY: float = 0.001
-
- def __init__(self, token_budget: float, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.token_budget = token_budget
- self.resolution = self.TOKEN_REDUCTION_GRANULARITY
- self.first_execution = True
- self.count = 0
- self.heatmap = [0] * int(1.0 / self.resolution)
-
- def __str__(self):
- op = super().__str__()
- op += f" Token Budget: {str(self.token_budget)}\n"
- return op
-
- def get_id_params(self):
- id_params = super().get_id_params()
- id_params = {"token_budget": self.token_budget, **id_params}
-
- return id_params
-
- def get_op_params(self):
- op_params = super().get_op_params()
- return {"token_budget": self.token_budget, **op_params}
-
- def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
- """
- Update the cost per record and quality estimates produced by LLMConvert's naive estimates.
- We adjust the cost per record to account for the reduced number of input tokens following
- token reduction, and we make a crude estimate of the quality degradation that results from
- using fewer tokens.
- """
- # get naive cost estimates from LLMConvert
- naive_op_cost_estimates = super().naive_cost_estimates(source_op_cost_estimates)
-
- # re-compute cost per record assuming we use fewer input tokens
- est_num_input_tokens = NAIVE_EST_NUM_INPUT_TOKENS * self.token_budget
- est_num_output_tokens = NAIVE_EST_NUM_OUTPUT_TOKENS
- model_conversion_usd_per_record = (
- MODEL_CARDS[self.model.value]["usd_per_input_token"] * est_num_input_tokens
- + MODEL_CARDS[self.model.value]["usd_per_output_token"] * est_num_output_tokens
- )
-
- # set refined estimate of cost per record and, for now,
- # assume quality multiplier is proportional to sqrt(sqrt(token_budget))
- naive_op_cost_estimates.cost_per_record = model_conversion_usd_per_record
- naive_op_cost_estimates.cost_per_record_lower_bound = naive_op_cost_estimates.cost_per_record
- naive_op_cost_estimates.cost_per_record_upper_bound = naive_op_cost_estimates.cost_per_record
- naive_op_cost_estimates.quality = (naive_op_cost_estimates.quality) * math.sqrt(math.sqrt(self.token_budget))
- naive_op_cost_estimates.quality_lower_bound = naive_op_cost_estimates.quality
- naive_op_cost_estimates.quality_upper_bound = naive_op_cost_estimates.quality
-
- return naive_op_cost_estimates
-
- def is_image_conversion(self) -> bool:
- """TokenReducedConvertBonded is currently disallowed on image conversions, so this must be False."""
- return False
-
- def reduce_context(self, full_context: str) -> str:
- range = find_best_range(
- self.heatmap,
- int(self.token_budget / self.TOKEN_REDUCTION_GRANULARITY),
- trim_zeros=False,
- )
- if not range:
- raise Exception("No range found in heatmap")
- si, ei = range
- print("si:", si, "ei:", ei)
- sr, er = (
- si * self.TOKEN_REDUCTION_GRANULARITY,
- ei * self.TOKEN_REDUCTION_GRANULARITY,
- )
- test_len = len(full_context)
- start = int(sr * test_len)
- end = int(er * test_len)
- if self.verbose:
- print(f"start ratio: {sr} -- end ratio: {er}")
- print("character start:", start, "end:", end)
- sample = full_context[start:end]
- return sample
-
- def _dspy_generate_fields(self, prompt: str, content: str | list[str]) -> tuple[list[dict[str, list]] | Any]:
- raise Exception(
- "TokenReducedConvertBonded is executing despite being deprecated until implementation changes can be made."
- )
- answer, query_stats = None, None
- if self.first_execution or self.count < self.MAX_HEATMAP_UPDATES:
- if self.verbose:
- print("Warming up heatmap")
- answer, query_stats = super()._dspy_generate_fields(prompt, content)
- self.first_execution = False
-
- else:
- if self.verbose:
- print("Using heatmap")
-
- # only refer to the heatmap if the count is greater than a enough sample size
- # TODO: only trim the context if the attention is clustered in a small region
- if self.count >= self.TOKEN_REDUCTION_SAMPLE:
- context = self.reduce_context(content)
- try:
- answer, _, query_stats = self.generator.generate(context=context, prompt=prompt)
- except Exception as e:
- print(f"DSPy generation error: {e}, falling back to unreduced generation")
- answer, query_stats = super()._dspy_generate_fields(prompt, content)
-
- # TODO: answer and query stats may be unbound if we hit the else block
- # and count < TOKEN_REDUCTION_SAMPLE, which makes the below pretty clunky
- # this throw asserts our view of the world and we should refactor this
- if answer is None or query_stats is None:
- raise Exception("answer or query_stats is None")
- try:
- match = best_substring_match(answer, content)
- if not match:
- gsi, gei = 0, len(content)
- else:
- gsi, gei = match
- except Exception as e:
- print("Error in substring match:", e)
- gsi, gei = 0, len(content)
- context_len = len(content)
- gsr, ger = gsi / context_len, gei / context_len
- norm_si, norm_ei = int(gsr / self.resolution), int(ger / self.resolution)
- if self.verbose:
- print(f"best_start: {gsi} -- best_end: {gei}")
-
- self.count += 1
- self.heatmap[norm_si:norm_ei] = map(lambda x: x + 1, self.heatmap[norm_si:norm_ei])
-
- return answer, query_stats
@@ -1,105 +0,0 @@
- from fuzzywuzzy import fuzz, process
-
-
- def find_best_range(values, budget, trim_zeros=False):
- """
- Finds the consecutive range with the biggest sum within a budget.
-
- Args:
- values: A list of non-negative numbers.
- budget: The maximum number of consecutive elements to consider.
-
- Returns:
- A tuple containing the start and end indices (inclusive) of the best range,
- or None if the array is empty.
- """
- if not values:
- return None
-
- n = len(values)
- best_sum, best_start, current_sum, current_start = 0, 0, 0, 0
-
- # Iterate through the array, keeping track of current and best ranges.
- for i in range(n):
- current_sum += values[i]
-
- # If the current range exceeds the budget, remove elements from the beginning.
- while current_start + budget - 1 < i and current_start + budget - 1 >= 0:
- current_sum -= values[current_start]
- current_start += 1
-
- # Update best range if the current sum is bigger.
- if current_sum > best_sum:
- best_sum = current_sum
- best_start = current_start
-
- best_end = best_start + budget - 1
- print("best_start:", best_start, "best_end:", best_end)
- if trim_zeros:
- # Trim leading/trailing zeros
- while best_start >= 0 and values[best_start] == 0:
- best_start += 1
-
- while best_end < n and values[best_end] == 0:
- best_end -= 1
- else:
- # balance the zero entries equally on both sides
- leading_zeros = 0
- trailing_zeros = 0
- start_idx = best_start
- end_idx = best_end
- while start_idx >= 0 and values[start_idx] == 0:
- leading_zeros += 1
- start_idx += 1
- while end_idx < n and values[end_idx] == 0:
- trailing_zeros += 1
- end_idx -= 1
- half_zeros = int((leading_zeros + trailing_zeros) / 2)
- print("leading_zeros:", leading_zeros, "trailing_zeros:", trailing_zeros, "half_zeros:", half_zeros)
- best_start = best_start - half_zeros + leading_zeros
- best_end = best_end - trailing_zeros + leading_zeros + trailing_zeros - half_zeros
-
- if best_start < 0:
- best_end = best_end - best_start
- best_start = 0
- if best_end >= n:
- best_start = best_start - (best_end - n + 1)
- best_end = n - 1
-
- return best_start, best_end + 1
-
-
- def get_range_from_hist(file_path, range_budget, resolution=0.001, trim_zeros=True):
- # Load data from csv file and extract he second column as values
- values = []
- with open(file_path) as file:
- for line in file:
- line = line.strip()
- values.append(int(float(line.split(",")[1])))
- index_range = 1 / resolution
- budget = int(range_budget * index_range)
- # Find the best range
- range = find_best_range(values, budget, trim_zeros=trim_zeros)
- if not range:
- raise ValueError("No range found")
- start, end = range
- print("start:", start, "end:", end, "index_range:", index_range)
- return start * 1.0 / index_range, end * 1.0 / index_range
-
-
- def best_substring_match(query: str, context: str | list[str]):
- # This will extract all substrings of length equal to the query from the string
- candidates = [context[i : i + len(query)] for i in range(len(context) - len(query) + 1)]
-
- # Find the best match among the candidates
- ret = process.extractOne(query, candidates, scorer=fuzz.ratio)
- if ret is None:
- return None
-
- best_match, score = ret
- positions = [can == best_match for can in candidates]
- start = positions.index(True)
- end = start + len(query)
- # print("best match:", best_match, "score:", score, "start:", start, "end:", end)
- # print("-------", string[start:end])
- return start, end