poma 0.1.2__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- poma-0.2.1/.github/workflows/python-publish.yml +71 -0
- poma-0.2.1/.gitignore +56 -0
- {poma-0.1.2 → poma-0.2.1}/PKG-INFO +4 -4
- {poma-0.1.2 → poma-0.2.1}/README.md +3 -3
- {poma-0.1.2 → poma-0.2.1}/poma/client.py +73 -59
- {poma-0.1.2 → poma-0.2.1}/poma/exceptions.py +1 -0
- {poma-0.1.2 → poma-0.2.1}/poma/integrations/langchain_poma.py +7 -17
- {poma-0.1.2 → poma-0.2.1}/poma/integrations/llamaindex_poma.py +7 -16
- poma-0.2.1/poma/retrieval.py +339 -0
- {poma-0.1.2 → poma-0.2.1}/poma.egg-info/PKG-INFO +4 -4
- {poma-0.1.2 → poma-0.2.1}/poma.egg-info/SOURCES.txt +2 -0
- {poma-0.1.2 → poma-0.2.1}/pyproject.toml +2 -2
- poma-0.1.2/poma/retrieval.py +0 -176
- {poma-0.1.2 → poma-0.2.1}/LICENSE +0 -0
- {poma-0.1.2 → poma-0.2.1}/poma/__init__.py +0 -0
- {poma-0.1.2 → poma-0.2.1}/poma/integrations/__init__.py +0 -0
- {poma-0.1.2 → poma-0.2.1}/poma.egg-info/dependency_links.txt +0 -0
- {poma-0.1.2 → poma-0.2.1}/poma.egg-info/requires.txt +0 -0
- {poma-0.1.2 → poma-0.2.1}/poma.egg-info/top_level.txt +0 -0
- {poma-0.1.2 → poma-0.2.1}/setup.cfg +0 -0

poma-0.2.1/.github/workflows/python-publish.yml
ADDED

@@ -0,0 +1,71 @@
+# This workflow will upload a Python Package to PyPI when a release is created
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+  push:
+    tags: [ '*.*.*' ]
+
+permissions:
+  contents: read
+
+jobs:
+  release-build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+
+      - name: Build release distributions
+        run: |
+          # NOTE: put your own distribution build steps here.
+          python -m pip install --upgrade pip
+          python -m pip install --upgrade build
+          python -m build
+
+      - name: Upload distributions
+        uses: actions/upload-artifact@v4
+        with:
+          name: release-dists
+          path: dist/
+
+  pypi-publish:
+    runs-on: ubuntu-latest
+    needs:
+      - release-build
+    permissions:
+      # IMPORTANT: this permission is mandatory for trusted publishing
+      id-token: write
+
+    # Dedicated environments with protections for publishing are strongly recommended.
+    # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules
+    environment:
+      name: pypi
+      # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
+      # url: https://pypi.org/p/YOURPROJECT
+      #
+      # ALTERNATIVE: if your GitHub Release name is the PyPI project version string
+      # ALTERNATIVE: exactly, uncomment the following line instead:
+      # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }}
+
+    steps:
+      - name: Retrieve release distributions
+        uses: actions/download-artifact@v4
+        with:
+          name: release-dists
+          path: dist/
+
+      - name: Publish release distributions to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          packages-dir: dist/

poma-0.2.1/.gitignore
ADDED

@@ -0,0 +1,56 @@
+# -------------------------------------------------------
+# Python bytecode
+# -------------------------------------------------------
+__pycache__/
+*.py[cod]
+*$py.class
+
+# -------------------------------------------------------
+# Virtualenv (Windows + Unix)
+# -------------------------------------------------------
+.venv/
+venv/
+env/
+.poma-sdk-venv/
+ENV/
+env.bak/
+.idea/
+
+# -------------------------------------------------------
+# VS Code
+# -------------------------------------------------------
+.vscode/
+
+# -------------------------------------------------------
+# Pytest
+# -------------------------------------------------------
+.cache/
+.pytest_cache/
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+
+# -------------------------------------------------------
+# Build / Dist
+# -------------------------------------------------------
+build/
+dist/
+*.egg-info/
+*.egg
+.eggs/
+
+# -------------------------------------------------------
+# Environment files
+# -------------------------------------------------------
+.env
+.env.*
+*.env
+
+# -------------------------------------------------------
+# Mac / Windows
+# -------------------------------------------------------
+.DS_Store
+Thumbs.db
+desktop.ini

{poma-0.1.2 → poma-0.2.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: poma
-Version: 0.1.2
+Version: 0.2.1
 Summary: Official Python SDK for the Poma document-processing API
 Author-email: "POMA AI GmbH, Berlin" <sdk@poma-ai.com>
 License-Expression: MPL-2.0
@@ -40,15 +40,15 @@ pip install poma
 
 For integrations into LangChain and LlamaIndex:
 ```bash
-pip install poma[integrations]
+pip install 'poma[integrations]'
 # Or LangChain/LlamaIndex including example extras:
-pip install poma[integration-examples]
+pip install 'poma[integration-examples]'
 ```
 
 
 - You may also want: `pip install python-dotenv` to load API keys from a .env file.
 - API keys required (POMA_API_KEY) for the POMA AI client via environment variables.
-- **To request a POMA_API_KEY, please contact us at
+- **To request a POMA_API_KEY, please contact us at sdk@poma-ai.com**
 
 
 ### Example Implementations — all examples, integrations, and additional information can be found in our GitHub repository: [poma-ai/poma](https://github.com/poma-ai/)

{poma-0.1.2 → poma-0.2.1}/README.md

@@ -12,15 +12,15 @@ pip install poma
 
 For integrations into LangChain and LlamaIndex:
 ```bash
-pip install poma[integrations]
+pip install 'poma[integrations]'
 # Or LangChain/LlamaIndex including example extras:
-pip install poma[integration-examples]
+pip install 'poma[integration-examples]'
 ```
 
 
 - You may also want: `pip install python-dotenv` to load API keys from a .env file.
 - API keys required (POMA_API_KEY) for the POMA AI client via environment variables.
-- **To request a POMA_API_KEY, please contact us at
+- **To request a POMA_API_KEY, please contact us at sdk@poma-ai.com**
 
 
 ### Example Implementations — all examples, integrations, and additional information can be found in our GitHub repository: [poma-ai/poma](https://github.com/poma-ai/)
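
Taken together with the client changes below, a minimal quickstart under these install instructions might look like the following sketch. It assumes POMA_API_KEY is already exported, and "example.poma" is a hypothetical archive path used purely for illustration:

```python
from poma import Poma

# Assumes POMA_API_KEY is set in the environment (e.g. loaded via python-dotenv);
# the constructor raises if the variable is missing.
client = Poma()

# "example.poma" is a placeholder path for a previously downloaded POMA archive.
result = client.extract_chunks_and_chunksets_from_poma_archive(
    poma_archive_path="example.poma"
)
print(len(result["chunks"]), len(result["chunksets"]))
```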

{poma-0.1.2 → poma-0.2.1}/poma/client.py

@@ -8,18 +8,15 @@ import time
 from pathlib import Path
 from typing import Any
 
-from poma.exceptions import
-
+from poma.exceptions import (
+    AuthenticationError,
+    RemoteServerError,
+    InvalidResponseError,
+)
+from poma.retrieval import generate_cheatsheets
 
-USER_AGENT = "poma-ai-sdk/0.1.0"
 
-
-    ".txt",
-    ".md",
-    ".html",
-    ".htm",
-    ".pdf",
-}
+USER_AGENT = "poma-ai-sdk/0.1.0"
 
 API_BASE_URL = "https://api.poma-ai.com/api/v1"
 
@@ -51,10 +48,13 @@ class Poma:
         # Override API base URL if environment variable is set
         if os.environ.get("API_BASE_URL"):
             api_base_url = os.environ.get("API_BASE_URL")
+        if not api_base_url:
+            raise ValueError("API base URL cannot be empty.")
 
         self.base_api_url = api_base_url.rstrip("/")
-        self._client = client or httpx.Client(
-
+        self._client = client or httpx.Client(
+            timeout=timeout, headers={"user-agent": USER_AGENT}
+        )
         if not (api_key := api_key or os.environ.get("POMA_API_KEY", "")):
             raise Exception("POMA_API_KEY environment variable not set.")
         self._client.headers.update({"Authorization": f"Bearer {api_key}"})
@@ -77,17 +77,12 @@ class Poma:
         """
         if not file_path or not isinstance(file_path, os.PathLike):
             raise ValueError("file_path must be a non-empty os.PathLike.")
-        file_extension = Path(file_path).suffix.lower()
-        if file_extension not in ALLOWED_FILE_EXTENSIONS:
-            raise InvalidInputError(
-                f"File extension of {file_path} is not allowed; use one of the following types: {', '.join(sorted(ALLOWED_FILE_EXTENSIONS))}."
-            )
         payload = {}
         if base_url:
             payload["base_url"] = base_url
         try:
             response = self._client.post(
-                f"{self.base_api_url}/
+                f"{self.base_api_url}/ingest",
                 data=payload,
                 files={
                     "file": (Path(file_path).name, Path(file_path).read_bytes()),
@@ -97,12 +92,18 @@ class Poma:
         except httpx.HTTPStatusError as error:
             status = error.response.status_code
             if status in (401, 403):
-                raise AuthenticationError(
-
+                raise AuthenticationError(
+                    f"Failed to submit file '{file_path}': authentication error"
+                ) from error
+            raise RemoteServerError(
+                f"Failed to submit file '{file_path}': {status}"
+            ) from error
         try:
             data = response.json()
         except ValueError as error:
-            raise InvalidResponseError(
+            raise InvalidResponseError(
+                "Server returned non-JSON or empty body"
+            ) from error
         return data
 
     def get_chunk_result(
@@ -149,19 +150,25 @@ class Poma:
             download = data.get("download", {})
             download_url = download.get("download_url", "")
             if not download_url:
-                raise RuntimeError(
+                raise RuntimeError(
+                    "Failed to receive download URL from server."
+                )
 
             if download_dir is None:
                 # Return bytes content instead of saving to file
                 file_bytes = self.download_bytes(download_url)
-                return self.extract_chunks_and_chunksets_from_poma_archive(
+                return self.extract_chunks_and_chunksets_from_poma_archive(
+                    poma_archive_data=file_bytes
+                )
             else:
                 # Save downloaded file to directory
                 filename = download.get("filename", "downloaded_file.poma")
-                downloaded_file_path = self.download_file(
-
-
-                return self.extract_chunks_and_chunksets_from_poma_archive(
+                downloaded_file_path = self.download_file(
+                    download_url, filename, save_directory=download_dir
+                )
+                return self.extract_chunks_and_chunksets_from_poma_archive(
+                    poma_archive_path=downloaded_file_path
+                )
         elif status == "failed":
             error_code = data.get("code", "unknown")
             error_details = data.get("error", "No details provided.")
@@ -197,33 +204,29 @@ class Poma:
         Returns:
             dict: A dictionary containing the chunks and chunksets.
         """
-        # Sanity check for parameters
-        if not poma_archive_data and not poma_archive_path:
-            raise ValueError("Either poma_archive_data or poma_archive_path must be provided.")
 
         # Load the chunks and chunksets from POMA archive
         chunks = None
         chunksets = None
         if poma_archive_path:
             with zipfile.ZipFile(poma_archive_path, "r") as zip_ref:
-                chunks = zip_ref.read(
-                chunksets = zip_ref.read(
-
+                chunks = zip_ref.read("chunks.json")
+                chunksets = zip_ref.read("chunksets.json")
+        elif poma_archive_data:
             with zipfile.ZipFile(io.BytesIO(poma_archive_data), "r") as zip_ref:
-                chunks = zip_ref.read(
-                chunksets = zip_ref.read(
+                chunks = zip_ref.read("chunks.json")
+                chunksets = zip_ref.read("chunksets.json")
+        else:
+            raise ValueError(
+                "Either poma_archive_data or poma_archive_path must be provided."
+            )
 
         # Sanity check
         if not chunks or not chunksets:
-            raise KeyError(
-                "Result must contain 'chunks' and 'chunksets' keys."
-            )
+            raise KeyError("Result must contain 'chunks' and 'chunksets' keys.")
 
         # Load the chunks and chunksets
-        json_result = {
-            "chunks": json.loads(chunks),
-            "chunksets": json.loads(chunksets)
-        }
+        json_result = {"chunks": json.loads(chunks), "chunksets": json.loads(chunksets)}
         return json_result
 
     def create_cheatsheet(
@@ -234,14 +237,24 @@ class Poma:
         """
         Generates a single cheatsheet for one single document
         from relevant chunksets (relevant for a certain query)
-        and from all
+        and from all chunks of that document (providing the textual content).
         Args:
             relevant_chunksets (list[dict]): A list of chunksets, each containing a "chunks" key with a list of chunk IDs.
-            all_chunks (list[dict]): A list of all
+            all_chunks (list[dict]): A list of all chunk dictionaries of the same document, each representing a chunk of content.
         Returns:
             str: The textual content of the generated cheatsheet.
         """
-
+        cheatsheets = generate_cheatsheets(relevant_chunksets, all_chunks)
+        if (
+            not cheatsheets
+            or not isinstance(cheatsheets, list)
+            or len(cheatsheets) == 0
+            or "content" not in cheatsheets[0]
+        ):
+            raise Exception(
+                "Unknown error; cheatsheet could not be created from input chunks."
+            )
+        return cheatsheets[0]["content"]
 
     def create_cheatsheets(
         self,
@@ -250,14 +263,14 @@ class Poma:
     ) -> list[dict[str, Any]]:
         """
         Generates cheatsheets from relevant chunksets (relevant for a certain query)
-        and from all
-        One cheatsheet is created for each document
+        and from all the chunks of all affected documents (providing the textual content).
+        One cheatsheet is created for each document found in the chunks (tagged with file_id).
         Args:
             relevant_chunksets (list[dict]): A list of chunksets, each containing a "chunks" key with a list of chunk IDs.
-            all_chunks (list[dict]): A list of all available chunk dictionaries, each representing a chunk of content.
+            all_chunks (list[dict]): A list of all available chunk dictionaries of affected documents, each representing a chunk of content.
         Returns:
             list[dict]: A list of dictionaries representing the generated cheatsheets, each containing:
-            - '
+            - 'file_id': The tag associated with the respective document.
             - 'content': The textual content of the generated cheatsheet.
         """
         return generate_cheatsheets(relevant_chunksets, all_chunks)
@@ -283,7 +296,7 @@ class Poma:
         """
         if not download_url:
             raise ValueError("download_url cannot be empty")
-
+
         # Determine filename
         if not filename:
             filename = Path(download_url).name or "downloaded_file"
@@ -303,7 +316,8 @@ class Poma:
         # Save the file
         with open(save_path, "wb") as f:
             f.write(content)
-
+
+        return str(save_path)
 
     def download_bytes(
         self,
@@ -319,27 +333,27 @@ class Poma:
         """
         if not download_url:
             raise ValueError("download_url cannot be empty")
-
+
         # Construct the full URL if it's a relative path
         if download_url.startswith("/"):
             full_url = f"{self.base_api_url}{download_url}"
         else:
             full_url = download_url
-
+
         print("Downloading file from:", full_url)
         try:
-            # Download the file
             response = self._client.get(full_url)
             response.raise_for_status()
-
-            # Return the bytes content
             return response.content
-
         except httpx.HTTPStatusError as error:
             status = error.response.status_code
             if status in (401, 403):
-                raise AuthenticationError(
-
+                raise AuthenticationError(
+                    f"Failed to download '{download_url}': authentication error"
+                ) from error
+            raise RemoteServerError(
+                f"Failed to download '{download_url}': {status}"
+            ) from error
         except Exception as error:
             raise RuntimeError(f"File download failed: {error}") from error
 
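
The hunks above flesh out the error handling in client.py: HTTP 401/403 now raises AuthenticationError, other HTTP errors raise RemoteServerError, and a non-JSON response body raises InvalidResponseError. A hedged sketch of how a caller might handle the download path (the relative URL below is hypothetical; real values come from the "download_url" field returned by get_chunk_result):

```python
from poma import Poma
from poma.exceptions import AuthenticationError, RemoteServerError

client = Poma()  # requires POMA_API_KEY in the environment

try:
    # "/download/abc123" is an illustrative placeholder, not a real endpoint path.
    content = client.download_bytes("/download/abc123")
except AuthenticationError:
    # Raised on HTTP 401/403: the server rejected the API key.
    print("Check POMA_API_KEY; the server rejected the credentials.")
except RemoteServerError as err:
    # Raised on any other HTTP error status.
    print(f"Server returned an error status: {err}")
except RuntimeError as err:
    # Any other failure is wrapped in a RuntimeError by download_bytes.
    print(f"Download failed: {err}")
else:
    print(f"Downloaded {len(content)} bytes")
```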

{poma-0.1.2 → poma-0.2.1}/poma/exceptions.py

@@ -16,5 +16,6 @@ class RemoteServerError(PomaSDKError):
 class InvalidInputError(PomaSDKError):
     """Raised when an unsupported *Content‑Type* is given to ``chunk_text``."""
 
+
 class InvalidResponseError(PomaSDKError):
     """Raised when the server returns non-JSON or empty body."""

{poma-0.1.2 → poma-0.2.1}/poma/integrations/langchain_poma.py

@@ -18,9 +18,8 @@ from langchain_text_splitters import TextSplitter
 from pydantic import Field, PrivateAttr
 
 from poma import Poma
-from poma.client import ALLOWED_FILE_EXTENSIONS
 from poma.exceptions import InvalidInputError
-from poma.retrieval import _cheatsheets_from_chunks
+from poma.retrieval import chunks_from_dicts, _cheatsheets_from_chunks
 
 __all__ = ["PomaFileLoader", "PomaChunksetSplitter", "PomaCheatsheetRetrieverLC"]
 
@@ -52,10 +51,6 @@ class PomaFileLoader(BaseLoader):
             nonlocal skipped, documents
             if not file_path.is_file():
                 return
-            file_extension = file_path.suffix.lower()
-            if not file_extension or file_extension not in ALLOWED_FILE_EXTENSIONS:
-                skipped += 1
-                return
             file_bytes = file_path.read_bytes()
             file_hash = hashlib.md5(file_bytes).hexdigest()
             if file_path.suffix.lower() == ".pdf":
@@ -84,13 +79,10 @@ class PomaFileLoader(BaseLoader):
         else:
             raise FileNotFoundError(f"Unsupported path type (not file/dir): {path}")
 
-        allowed = ", ".join(sorted(ALLOWED_FILE_EXTENSIONS))
         if not documents:
-            raise InvalidInputError(f"No supported files found.
+            raise InvalidInputError(f"No supported files found.")
         if skipped > 0:
-            print(
-                f"Skipped {skipped} file(s) due to unsupported or unreadable type. Allowed: {allowed}"
-            )
+            print(f"Skipped {skipped} file(s) due to unsupported or unreadable type.")
         return documents
 
 
@@ -330,7 +322,7 @@ class PomaCheatsheetRetrieverLC(BaseRetriever):
 
     def _create_cheatsheet_langchain(self, chunked_docs: list[Document]) -> str:
         """Generate a single deduplicated cheatsheet from chunked documents."""
-
+        all_chunk_dicts = []
         seen = set()
         for doc in chunked_docs:
             doc_id = doc.metadata.get("doc_id", "unknown_doc")
@@ -341,11 +333,9 @@ class PomaCheatsheetRetrieverLC(BaseRetriever):
                 if chunk_index not in seen:
                     seen.add(chunk_index)
                     chunk["tag"] = doc_id
-
-
-
-            )
-        cheatsheets = _cheatsheets_from_chunks(sorted_chunks)
+                    all_chunk_dicts.append(chunk)
+        all_chunks = chunks_from_dicts(all_chunk_dicts)
+        cheatsheets = _cheatsheets_from_chunks(all_chunks)
         if (
             not cheatsheets
             or not isinstance(cheatsheets, list)

{poma-0.1.2 → poma-0.2.1}/poma/integrations/llamaindex_poma.py

@@ -23,9 +23,8 @@ from llama_index.core.schema import (
 from pydantic import PrivateAttr
 
 from poma import Poma
-from poma.client import ALLOWED_FILE_EXTENSIONS
-from poma.retrieval import _cheatsheets_from_chunks
 from poma.exceptions import InvalidInputError
+from poma.retrieval import chunks_from_dicts, _cheatsheets_from_chunks
 
 __all__ = ["PomaFileReader", "PomaChunksetNodeParser", "PomaCheatsheetRetrieverLI"]
 
@@ -54,9 +53,6 @@ class PomaFileReader(BaseReader):
             if not file_path.is_file():
                 return
             file_extension = file_path.suffix.lower()
-            if not file_extension or file_extension not in ALLOWED_FILE_EXTENSIONS:
-                skipped += 1
-                return
             file_bytes = file_path.read_bytes()
             file_hash = hashlib.md5(file_bytes).hexdigest()
             if file_extension == ".pdf":
@@ -87,13 +83,10 @@ class PomaFileReader(BaseReader):
         else:
             raise FileNotFoundError(f"Unsupported path type (not file/dir): {path}")
 
-        allowed = ", ".join(sorted(ALLOWED_FILE_EXTENSIONS))
         if not documents:
-            raise InvalidInputError(f"No supported files found.
+            raise InvalidInputError(f"No supported files found.")
         if skipped > 0:
-            print(
-                f"Skipped {skipped} file(s) due to unsupported or unreadable type. Allowed: {allowed}"
-            )
+            print(f"Skipped {skipped} file(s) due to unsupported or unreadable type.")
         return documents
 
 
@@ -329,7 +322,7 @@ class PomaCheatsheetRetrieverLI(BaseRetriever):
 
     def _create_cheatsheet_llamaindex(self, chunked_nodes: list[NodeWithScore]) -> str:
         """Generate a single deduplicated cheatsheet from chunked nodes."""
-
+        all_chunk_dicts = []
        seen = set()
         for node in chunked_nodes:
             doc_id = node.metadata.get("doc_id", "unknown_doc")
@@ -344,11 +337,9 @@ class PomaCheatsheetRetrieverLI(BaseRetriever):
                     continue
                 seen.add(chunk_index)
                 chunk["tag"] = doc_id
-
-
-
-            )
-        cheatsheets = _cheatsheets_from_chunks(sorted_chunks)
+                all_chunk_dicts.append(chunk)
+        all_chunks = chunks_from_dicts(all_chunk_dicts)
+        cheatsheets = _cheatsheets_from_chunks(all_chunks)
         if (
             not cheatsheets
             or not isinstance(cheatsheets, list)

poma-0.2.1/poma/retrieval.py
ADDED

@@ -0,0 +1,339 @@
+# retrieval.py
+from collections import defaultdict
+from itertools import chain
+from typing import Any
+import warnings
+
+
+def deprecated(replacement: str):
+    def decorator(func):
+        msg = (
+            f"{func.__name__}() is deprecated and will be removed in a future version. "
+            f"Use {replacement} instead."
+        )
+
+        def wrapper(*args, **kwargs):
+            warnings.warn(msg, DeprecationWarning, stacklevel=2)
+            return func(*args, **kwargs)
+
+        wrapper.__name__ = func.__name__
+        wrapper.__doc__ = (func.__doc__ or "") + f"\n\nDEPRECATED: {msg}\n"
+        return wrapper
+
+    return decorator
+
+
+def generate_cheatsheets(
+    relevant_chunksets: list[dict[str, Any]],
+    all_chunks: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    # get chunks grouped by document file_id
+    doc_chunks = defaultdict(list)
+    for chunk in all_chunks:
+        file_id = chunk.get("file_id") or chunk.get("tag") or "single_doc"
+        chunk["file_id"] = file_id  # update
+        doc_chunks[file_id].append(chunk)
+
+    # Check for duplicate chunk_index values
+    # (necessary when file_id was not set in chunks)
+    for file_id, chunks in doc_chunks.items():
+        chunk_indices = [c["chunk_index"] for c in chunks]
+        if len(chunk_indices) != len(set(chunk_indices)):
+            raise ValueError(f"Duplicate chunk_index found for file_id: {file_id}")
+
+    # get relevant chunksets grouped by document file_id
+    relevant_chunksets_per_doc = defaultdict(list)
+    for chunkset in relevant_chunksets:
+        file_id = chunkset.get("file_id") or chunkset.get("tag") or "single_doc"
+        chunkset["file_id"] = file_id  # update
+        if "chunks" not in chunkset:
+            raise ValueError(
+                "Chunkset not valid; must contain a 'chunks' key with a list of chunk IDs."
+            )
+        relevant_chunksets_per_doc[file_id].append(chunkset)
+
+    # Ensure that chunksets and chunks correspond to the same file_ids
+    for file_id in relevant_chunksets_per_doc.keys():
+        if file_id not in doc_chunks:
+            raise ValueError(
+                f"Chunksets contain file_id '{file_id}' which is not present in the chunks."
+            )
+
+    # retrieve relevant chunks with content per document
+    relevant_content_chunks: list[RetrievalChunk] = []
+    for file_id, chunksets_per_doc in relevant_chunksets_per_doc.items():
+        chunk_ids = list(  # flattened list
+            chain.from_iterable(chunkset["chunks"] for chunkset in chunksets_per_doc)
+        )
+        relevant_chunks_dict = _get_relevant_chunks_for_ids(
+            chunk_ids, doc_chunks[file_id]
+        )
+        relevant_chunks: list[RetrievalChunk] = chunks_from_dicts(relevant_chunks_dict)
+        relevant_content_chunks.extend(relevant_chunks)
+
+    return _cheatsheets_from_chunks(relevant_content_chunks)
+
+
+@deprecated("generate_cheatsheets(relevant_chunksets, all_chunks)")
+def generate_single_cheatsheet(
+    relevant_chunksets: list[dict[str, Any]],
+    all_chunks: list[dict[str, Any]],
+) -> str:
+    cheatsheets = generate_cheatsheets(
+        relevant_chunksets=relevant_chunksets,
+        all_chunks=all_chunks,
+    )
+    return cheatsheets[0].get("content", "") if cheatsheets else ""
+
+
+########################
+# RetrievalChunk Class #
+########################
+
+
+class RetrievalChunk:
+    """
+    Represents a chunk of text with associated metadata.
+    Attributes:
+        index (int): The index of the chunk within a sequence.
+        file_id (str): The id associating the chunk with a document.
+        content (str): The textual content of the chunk.
+        depth_rebased (int, optional): The hierarchical depth of the chunk content.
+            In cheatsheets, this affects indentation for certain text parts.
+            Currently only used for code blocks.
+    """
+
+    def __init__(
+        self,
+        index: int,
+        file_id: str,
+        content: str,
+        depth_rebased: int | None,
+    ):
+        self.index = index
+        self.file_id = file_id
+        self.content = content
+        self.depth_rebased = depth_rebased
+
+    @classmethod
+    def from_chunk_dict(
+        cls,
+        chunk_dict: dict,
+        block_min_depth: int | None,
+    ):
+        if block_min_depth is not None:
+            depth = int(chunk_dict["depth"])
+            depth_rebased = cls._rebase_depth(depth, block_min_depth)
+        else:
+            depth_rebased = None
+        return cls(
+            index=int(chunk_dict["chunk_index"]),
+            file_id=str(
+                chunk_dict.get("file_id") or chunk_dict.get("tag") or "single_doc"
+            ),
+            content=str(chunk_dict["content"]),
+            depth_rebased=depth_rebased,
+        )
+
+    @staticmethod
+    def _rebase_depth(depth: int, min_depth: int, base_unit: int = 0) -> int | None:
+        rebased = depth - min_depth + base_unit
+        return max(0, rebased)
+
+    def __repr__(self):
+        return f"RetrievalChunk(index={self.index}, file_id={self.file_id}, content={self.content}), depth_rebased={self.depth_rebased}"
+
+
+def chunks_from_dicts(chunk_dicts: list[dict]) -> list[RetrievalChunk]:
+    """
+    Converts a list of chunk dictionaries into a list of Chunk objects.
+    File_ids are needed to identify chunks from different documents;
+    if is_single_doc is True, all chunks are assumed to come from a single document
+    and file_id is optional.
+    Args:
+        chunk_dicts (list[dict]): A list of dictionaries, each representing a chunk with required keys:
+            - "chunk_index": The index of the chunk within the document.
+            - "file_id": The identifier of the document.
+            - "content": The textual content of the chunk.
+            - "depth": The depth or level of the chunk.
+    Returns:
+        list[Chunk]: A list of Chunk objects with the textual content needed for the cheatsheets.
+    """
+
+    # Determine the minimum depth per code block
+    min_depth_per_code_block: dict[str, int] = {}
+    for chunk_dict in chunk_dicts:
+        block_id = chunk_dict.get("code")
+        if block_id is None:
+            continue
+        depth = int(chunk_dict["depth"])
+        current = min_depth_per_code_block.get(block_id)
+        min_depth_per_code_block[block_id] = (
+            depth if current is None else min(current, depth)
+        )
+
+    # Create Chunk objects
+    all_chunks: list[RetrievalChunk] = []
+    for chunk_dict in chunk_dicts:
+        code_id = chunk_dict.get("code")
+        if bool(code_id):
+            block_min_depth = min_depth_per_code_block.get(str(code_id))
+        else:
+            block_min_depth = None
+        chunk = RetrievalChunk.from_chunk_dict(chunk_dict, block_min_depth)
+        all_chunks.append(chunk)
+
+    # Sanity check: Make sure there are no duplicate chunk_index values
+    check_dict = defaultdict(set)
+    has_duplicates = any(
+        chunk.index in check_dict[chunk.file_id]
+        or check_dict[chunk.file_id].add(chunk.index)
+        for chunk in all_chunks
+    )
+    if has_duplicates:
+        raise ValueError(
+            "Duplicate chunk indices found in single document mode. "
+            "Each chunk must have a unique index."
+        )
+
+    return all_chunks
+
+
+###################
+# Private Methods #
+###################
+
+
+def _get_relevant_chunks_for_ids(
+    chunk_ids: list[int],
+    chunks: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    chunk_indices_of_retrieved_chunksets = chunk_ids
+    all_chunks_of_doc = chunks
+
+    # Build helpers
+    sorted_chunks = sorted(all_chunks_of_doc, key=lambda c: c["chunk_index"])
+    index_to_chunk = {c["chunk_index"]: c for c in sorted_chunks}
+    index_to_depth = {c["chunk_index"]: c["depth"] for c in sorted_chunks}
+
+    # Find relatively deepest indices in the retrieval
+    candidate_indices = set(chunk_indices_of_retrieved_chunksets)
+
+    def is_ancestor(idx1, idx2):
+        """True if idx1 is an ancestor of idx2."""
+        # idx1 must be before idx2 and have smaller depth
+        if idx1 >= idx2:
+            return False
+        depth1 = index_to_depth[idx1]
+        depth2 = index_to_depth[idx2]
+        if depth1 >= depth2:
+            return False
+        # scan from idx1+1 up to idx2, making sure all are deeper than depth1 until idx2
+        for i in range(idx1 + 1, idx2 + 1):
+            depth = index_to_depth[sorted_chunks[i]["chunk_index"]]
+            if depth <= depth1 and sorted_chunks[i]["chunk_index"] != idx2:
+                return False
+        return True
+
+    # Exclude any index that is an ancestor of another in the set
+    relatively_deepest = set(candidate_indices)
+    for idx1 in candidate_indices:
+        for idx2 in candidate_indices:
+            if idx1 != idx2 and is_ancestor(idx1, idx2):
+                relatively_deepest.discard(idx1)
+                break
+
+    # Standard subtree/parent finding routines
+    def get_child_indices(chunk_index: int) -> list[int]:
+        base_depth = index_to_depth[chunk_index]
+        children = []
+        for i in range(chunk_index + 1, len(sorted_chunks)):
+            idx = sorted_chunks[i]["chunk_index"]
+            depth = sorted_chunks[i]["depth"]
+            if depth <= base_depth:
+                break
+            children.append(idx)
+        return children
+
+    def get_parent_indices(chunk_index: int) -> list[int]:
+        parents = []
+        current_depth = index_to_depth[chunk_index]
+        for i in range(chunk_index - 1, -1, -1):
+            idx = sorted_chunks[i]["chunk_index"]
+            depth = sorted_chunks[i]["depth"]
+            if depth < current_depth:
+                parents.append(idx)
+                current_depth = depth
+        return parents[::-1]  # root -> leaf order
+
+    # Collect all relevant indices
+    all_indices = set(
+        chunk_indices_of_retrieved_chunksets
+    )  # always include all search hits
+    for idx in relatively_deepest:
+        all_indices.update(get_child_indices(idx))
+
+    # Parents for all found nodes
+    for idx in list(all_indices):
+        all_indices.update(get_parent_indices(idx))
+
+    # Return in doc order
+    return [index_to_chunk[i] for i in sorted(all_indices)]
+
+
+def _cheatsheets_from_chunks(
+    content_chunks: list[RetrievalChunk],
+) -> list[dict[str, Any]]:
+    if not content_chunks:
+        return []
+
+    if isinstance(content_chunks[0], dict):
+        raise ValueError(
+            "Input to _cheatsheets_from_chunks must be a list of RetrievalChunk objects, not dicts."
+            "Use chunks_from_dicts() to convert dicts to RetrievalChunk objects first."
+        )
+
+    def _format_chunk_content(chunk: "RetrievalChunk") -> str:
+        if not getattr(chunk, "depth_rebased", False):
+            return chunk.content
+        else:
+            indent = " " * 4 * (chunk.depth_rebased or 0)
+            return f"{indent}{chunk.content}"
+
+    cheatsheets: list[dict] = []
+
+    compressed_data = {}
+    content_chunks = sorted(content_chunks, key=lambda c: (c.file_id, c.index))
+    for chunk in content_chunks:
+        if chunk.file_id not in compressed_data:
+            # If there is data stored for a previous file_id, save it to the cheatsheets list
+            if compressed_data:
+                for key, value in compressed_data.items():
+                    cheatsheets.append(
+                        {"file_id": key, "tag": key, "content": value["content"]}
+                    )
+                # Clear the compressed_data for the current file_id
+                compressed_data.clear()
+            # Start a new entry for the current file_id
+            compressed_data[chunk.file_id] = {
+                "content": _format_chunk_content(chunk),
+                "last_chunk": chunk.index,
+            }
+        else:
+            chunk_content = _format_chunk_content(chunk)
+            # Check if chunks are consecutive
+            if chunk.index == int(compressed_data[chunk.file_id]["last_chunk"]) + 1:
+                compressed_data[chunk.file_id]["content"] += "\n" + chunk_content
+            else:
+                compressed_data[chunk.file_id]["content"] += "\n[…]\n" + chunk_content
+            # Update the last chunk index
+            compressed_data[chunk.file_id]["last_chunk"] = chunk.index
+
+    # Save the last processed entry to the cheatsheets list
+    if compressed_data:
+        for key, value in compressed_data.items():
+            cheatsheets.append(
+                {"file_id": key, "tag": key, "content": value["content"]}
+            )
+
+    return cheatsheets
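
The new module's public entry point is generate_cheatsheets(). A small self-contained sketch with made-up chunk data shows how output is grouped per file_id and how a gap between non-consecutive chunks is rendered as a "[…]" line:

```python
from poma.retrieval import generate_cheatsheets

# Made-up chunks for two documents; each dict needs chunk_index, depth, content,
# and a file_id (or legacy "tag") so results can be grouped per document.
all_chunks = [
    {"file_id": "a.md", "chunk_index": 0, "depth": 0, "content": "# Title A"},
    {"file_id": "a.md", "chunk_index": 1, "depth": 1, "content": "Intro paragraph"},
    {"file_id": "a.md", "chunk_index": 2, "depth": 1, "content": "Unrelated section"},
    {"file_id": "a.md", "chunk_index": 3, "depth": 1, "content": "Relevant details"},
    {"file_id": "b.md", "chunk_index": 0, "depth": 0, "content": "# Title B"},
    {"file_id": "b.md", "chunk_index": 1, "depth": 1, "content": "Key fact in B"},
]

# Chunksets as they might come back from retrieval: lists of chunk IDs per document.
relevant_chunksets = [
    {"file_id": "a.md", "chunks": [1, 3]},
    {"file_id": "b.md", "chunks": [1]},
]

for sheet in generate_cheatsheets(relevant_chunksets, all_chunks):
    print(f"--- {sheet['file_id']} ---")
    print(sheet["content"])

# For a.md the parent heading (chunk 0) is pulled in automatically and the jump
# from chunk 1 to chunk 3 is marked with a "[…]" line; b.md gets its own cheatsheet.
```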

{poma-0.1.2 → poma-0.2.1}/poma.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: poma
-Version: 0.1.2
+Version: 0.2.1
 Summary: Official Python SDK for the Poma document-processing API
 Author-email: "POMA AI GmbH, Berlin" <sdk@poma-ai.com>
 License-Expression: MPL-2.0
@@ -40,15 +40,15 @@ pip install poma
 
 For integrations into LangChain and LlamaIndex:
 ```bash
-pip install poma[integrations]
+pip install 'poma[integrations]'
 # Or LangChain/LlamaIndex including example extras:
-pip install poma[integration-examples]
+pip install 'poma[integration-examples]'
 ```
 
 
 - You may also want: `pip install python-dotenv` to load API keys from a .env file.
 - API keys required (POMA_API_KEY) for the POMA AI client via environment variables.
-- **To request a POMA_API_KEY, please contact us at
+- **To request a POMA_API_KEY, please contact us at sdk@poma-ai.com**
 
 
 ### Example Implementations — all examples, integrations, and additional information can be found in our GitHub repository: [poma-ai/poma](https://github.com/poma-ai/)

{poma-0.1.2 → poma-0.2.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "poma"
-version = "0.1.2"
+version = "0.2.1"
 description = "Official Python SDK for the Poma document-processing API"
 authors = [{ name = "POMA AI GmbH, Berlin", email = "sdk@poma-ai.com" }]
 readme = "README.md"
@@ -34,7 +34,7 @@ integration-examples = [
 ]
 
 [build-system]
-requires = ["setuptools>=67", "wheel"]
+requires = ["setuptools>=67", "wheel", "setuptools-scm>=8"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools.packages.find]

poma-0.1.2/poma/retrieval.py
DELETED

@@ -1,176 +0,0 @@
-# retrieval.py
-from collections import defaultdict
-from itertools import chain
-from typing import Any
-
-
-def generate_cheatsheets(
-    relevant_chunksets: list[dict[str, Any]], all_chunks: list[dict[str, Any]]
-) -> list[dict[str, Any]]:
-    chunk_ids = [cs["chunks"] for cs in relevant_chunksets if "chunks" in cs]
-    chunk_ids = list(chain.from_iterable(chunk_ids))  # flatten the list
-    relevant_chunks = _get_relevant_chunks_for_ids(chunk_ids, all_chunks)
-    sorted_chunks = sorted(
-        relevant_chunks, key=lambda chunk: (chunk["tag"], chunk["chunk_index"])
-    )
-    return _cheatsheets_from_chunks(sorted_chunks)
-
-
-def generate_single_cheatsheet(
-    relevant_chunksets: list[dict[str, Any]], all_chunks: list[dict[str, Any]]
-) -> str:
-
-    def prepare_single_doc_chunks(
-        chunk_dicts: list[dict[str, Any]],
-    ) -> list[dict[str, Any]]:
-        # Make sure there are no duplicate chunk_index values
-        check_dict = defaultdict(set)
-        has_duplicates = any(
-            chunk["chunk_index"] in check_dict[chunk["tag"]]
-            or check_dict[chunk["tag"]].add(chunk["chunk_index"])
-            for chunk in chunk_dicts
-        )
-        if has_duplicates:
-            raise ValueError(
-                "Duplicate chunk indices found in single document mode. "
-                "Each chunk must have a unique index."
-            )
-        # Use a fixed tag for chunks from single documents
-        for chunk_dict in chunk_dicts:
-            chunk_dict["tag"] = "single_doc"
-        return chunk_dicts
-
-    chunk_ids = [cs["chunks"] for cs in relevant_chunksets if "chunks" in cs]
-    chunk_ids = list(chain.from_iterable(chunk_ids))  # flatten the list
-    relevant_chunks = _get_relevant_chunks_for_ids(chunk_ids, all_chunks)
-    relevant_chunks = prepare_single_doc_chunks(relevant_chunks)
-    sorted_chunks = sorted(
-        relevant_chunks, key=lambda chunk: (chunk["tag"], chunk["chunk_index"])
-    )
-    cheatsheets = _cheatsheets_from_chunks(sorted_chunks)
-    if (
-        not cheatsheets
-        or not isinstance(cheatsheets, list)
-        or len(cheatsheets) == 0
-        or "content" not in cheatsheets[0]
-    ):
-        raise Exception(
-            "Unknown error; cheatsheet could not be created from input chunks."
-        )
-    return cheatsheets[0]["content"]
-
-
-def _get_relevant_chunks_for_ids(
-    chunk_ids: list[int],
-    chunks: list[dict[str, Any]],
-) -> list[dict[str, Any]]:
-    chunk_indices_of_retrieved_chunksets = chunk_ids
-    all_chunks_of_doc = chunks
-
-    # Build helpers
-    sorted_chunks = sorted(all_chunks_of_doc, key=lambda c: c["chunk_index"])
-    index_to_chunk = {c["chunk_index"]: c for c in sorted_chunks}
-    index_to_depth = {c["chunk_index"]: c["depth"] for c in sorted_chunks}
-
-    # Find relatively deepest indices in the retrieval
-    candidate_indices = set(chunk_indices_of_retrieved_chunksets)
-
-    def is_ancestor(idx1, idx2):
-        """True if idx1 is an ancestor of idx2."""
-        # idx1 must be before idx2 and have smaller depth
-        if idx1 >= idx2:
-            return False
-        depth1 = index_to_depth[idx1]
-        depth2 = index_to_depth[idx2]
-        if depth1 >= depth2:
-            return False
-        # scan from idx1+1 up to idx2, making sure all are deeper than depth1 until idx2
-        for i in range(idx1 + 1, idx2 + 1):
-            depth = index_to_depth[sorted_chunks[i]["chunk_index"]]
-            if depth <= depth1 and sorted_chunks[i]["chunk_index"] != idx2:
-                return False
-        return True
-
-    # Exclude any index that is an ancestor of another in the set
-    relatively_deepest = set(candidate_indices)
-    for idx1 in candidate_indices:
-        for idx2 in candidate_indices:
-            if idx1 != idx2 and is_ancestor(idx1, idx2):
-                relatively_deepest.discard(idx1)
-                break
-
-    # Standard subtree/parent finding routines
-    def get_child_indices(chunk_index: int) -> list[int]:
-        base_depth = index_to_depth[chunk_index]
-        children = []
-        for i in range(chunk_index + 1, len(sorted_chunks)):
-            idx = sorted_chunks[i]["chunk_index"]
-            depth = sorted_chunks[i]["depth"]
-            if depth <= base_depth:
-                break
-            children.append(idx)
-        return children
-
-    def get_parent_indices(chunk_index: int) -> list[int]:
-        parents = []
-        current_depth = index_to_depth[chunk_index]
-        for i in range(chunk_index - 1, -1, -1):
-            idx = sorted_chunks[i]["chunk_index"]
-            depth = sorted_chunks[i]["depth"]
-            if depth < current_depth:
-                parents.append(idx)
-                current_depth = depth
-        return parents[::-1]  # root -> leaf order
-
-    # Collect all relevant indices
-    all_indices = set(
-        chunk_indices_of_retrieved_chunksets
-    )  # always include all search hits
-    for idx in relatively_deepest:
-        all_indices.update(get_child_indices(idx))
-
-    # Parents for all found nodes
-    for idx in list(all_indices):
-        all_indices.update(get_parent_indices(idx))
-
-    # Return in doc order
-    return [index_to_chunk[i] for i in sorted(all_indices)]
-
-
-def _cheatsheets_from_chunks(
-    content_chunks: list[dict[str, Any]],
-) -> list[dict[str, Any]]:
-    cheatsheets: list[dict] = []
-
-    compressed_data = {}
-    for chunk in content_chunks:
-        if chunk["tag"] not in compressed_data:
-            # If there is data stored for a previous tag, save it to the cheatsheets list
-            if compressed_data:
-                for key, value in compressed_data.items():
-                    cheatsheets.append({"tag": key, "content": value["content"]})
-                # Clear the compressed_data for the current tag
-                compressed_data.clear()
-            # Start a new entry for the current tag
-            compressed_data[chunk["tag"]] = {
-                "content": chunk["content"],
-                "last_chunk": chunk["chunk_index"],
-            }
-        else:
-            # Check if chunks are consecutive
-            if (
-                chunk["chunk_index"]
-                == int(compressed_data[chunk["tag"]]["last_chunk"]) + 1
-            ):
-                compressed_data[chunk["tag"]]["content"] += "\n" + chunk["content"]
-            else:
-                compressed_data[chunk["tag"]]["content"] += "\n[…]\n" + chunk["content"]
-            # Update the last chunk index
-            compressed_data[chunk["tag"]]["last_chunk"] = chunk["chunk_index"]
-
-    # Save the last processed entry to the cheatsheets list
-    if compressed_data:
-        for key, value in compressed_data.items():
-            cheatsheets.append({"tag": key, "content": value["content"]})
-
-    return cheatsheets

The remaining files listed above (LICENSE, poma/__init__.py, poma/integrations/__init__.py, poma.egg-info/dependency_links.txt, poma.egg-info/requires.txt, poma.egg-info/top_level.txt, and setup.cfg) are unchanged between the two versions.