poma 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- poma/__init__.py +15 -0
- poma/client.py +353 -0
- poma/exceptions.py +20 -0
- poma/integrations/__init__.py +20 -0
- poma/integrations/langchain_poma.py +358 -0
- poma/integrations/llamaindex_poma.py +361 -0
- poma/retrieval.py +176 -0
- poma-0.0.0.dist-info/METADATA +66 -0
- poma-0.0.0.dist-info/RECORD +12 -0
- poma-0.0.0.dist-info/WHEEL +5 -0
- poma-0.0.0.dist-info/licenses/LICENSE +177 -0
- poma-0.0.0.dist-info/top_level.txt +1 -0
poma/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Public entry points of the POMA SDK: the client class and its exception types."""

from .client import Poma
from .exceptions import (
    PomaSDKError,
    AuthenticationError,
    RemoteServerError,
    InvalidInputError,
    InvalidResponseError,
)

# InvalidResponseError is raised by the client, so it is re-exported alongside the
# other SDK exceptions to let callers catch it without importing poma.exceptions.
__all__ = [
    "Poma",
    "PomaSDKError",
    "AuthenticationError",
    "RemoteServerError",
    "InvalidInputError",
    "InvalidResponseError",
]
|
poma/client.py
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
# client.py
|
|
2
|
+
import os
|
|
3
|
+
import httpx
|
|
4
|
+
import zipfile
|
|
5
|
+
import io
|
|
6
|
+
import json
|
|
7
|
+
import time
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from poma.exceptions import AuthenticationError, InvalidInputError, RemoteServerError, InvalidResponseError
|
|
12
|
+
from poma.retrieval import generate_cheatsheets, generate_single_cheatsheet
|
|
13
|
+
|
|
14
|
+
# Default endpoint of the POMA REST API; the client strips any trailing slash
# and lets the `API_BASE_URL` environment variable override it.
API_BASE_URL = "https://api.poma-ai.com/api/v1"

# Value sent in the `user-agent` header of the default HTTP client.
USER_AGENT = "poma-ai-sdk/0.1.0"

# File suffixes accepted for ingestion; suffixes are lower-cased before the lookup.
ALLOWED_FILE_EXTENSIONS: set[str] = {".htm", ".html", ".md", ".pdf", ".txt"}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class Poma:
    """
    Client for interacting with the POMA API.

    Wraps an ``httpx.Client`` that carries the bearer token and offers helpers to
    submit a file for chunking, poll for the result, download and unpack the
    resulting POMA archive, and build cheatsheets from chunks and chunksets.
    Can be used as a context manager; leaving the ``with`` block closes the
    underlying HTTP client.
    """

    def __init__(
        self,
        api_key: str | None = None,
        *,
        timeout: float = 600.0,
        client: httpx.Client | None = None,
    ):
        """
        Initialize the POMA client.
        Args:
            api_key (str, optional):
                API key for authenticating with POMA. If not provided,
                the value is read from the environment variable `POMA_API_KEY`.
            timeout (float, default=600.0):
                Timeout (in seconds) for all HTTP requests. Only used when the
                default client is created; a custom `client` keeps its own timeout.
            client (httpx.Client, optional):
                Custom HTTP client. If not provided, a default client is created.
                Its headers are updated in place with the `Authorization` header.
        Raises:
            AuthenticationError: If no API key is passed and `POMA_API_KEY` is
                unset or empty.
        """
        # Resolve the key before creating any client so that a missing key does
        # not leave a freshly created, never-closed HTTP client behind.
        api_key = api_key or os.environ.get("POMA_API_KEY", "")
        if not api_key:
            raise AuthenticationError("POMA_API_KEY environment variable not set.")

        # A non-empty `API_BASE_URL` environment variable overrides the default endpoint.
        api_base_url = os.environ.get("API_BASE_URL") or API_BASE_URL
        self.base_api_url = api_base_url.rstrip("/")

        self._client = client or httpx.Client(
            timeout=timeout,
            headers={"user-agent": USER_AGENT},
        )
        self._client.headers.update({"Authorization": f"Bearer {api_key}"})

    def start_chunk_file(
        self,
        file_path: str | os.PathLike[str],
        *,
        base_url: str | None = None,
    ) -> dict[str, Any]:
        """
        Submit a file with text to POMA for chunking.
        Args:
            file_path (str | os.PathLike[str]):
                Path to the input file. Must have an allowed file extension.
            base_url (str, optional):
                Optional base URL to resolve relative links within the file.
        Returns:
            The decoded JSON body of the server's response.
            NOTE(review): presumably contains the job identifier expected by
            `get_chunk_result` — confirm against the API.
        Raises:
            ValueError: If `file_path` is empty or neither a str nor an os.PathLike.
            InvalidInputError: If the file extension is not in `ALLOWED_FILE_EXTENSIONS`.
            AuthenticationError: If the server answers with 401 or 403.
            RemoteServerError: If the server answers with any other error status.
            InvalidResponseError: If the response body is not valid JSON.
        """
        if not file_path or not isinstance(file_path, (str, os.PathLike)):
            raise ValueError("file_path must be a non-empty str or os.PathLike.")
        source = Path(file_path)
        if source.suffix.lower() not in ALLOWED_FILE_EXTENSIONS:
            raise InvalidInputError(
                f"File extension of {file_path} is not allowed; use one of the following types: {', '.join(sorted(ALLOWED_FILE_EXTENSIONS))}."
            )

        payload = {"base_url": base_url} if base_url else {}
        try:
            response = self._client.post(
                f"{self.base_api_url}/ingest",
                data=payload,
                files={"file": (source.name, source.read_bytes())},
            )
            response.raise_for_status()
        except httpx.HTTPStatusError as error:
            # Read status and body from the exception's own response object.
            status = error.response.status_code
            body = error.response.text
            if status in (401, 403):
                raise AuthenticationError(body) from error
            raise RemoteServerError(f"{status}: {body}") from error

        try:
            return response.json()
        except ValueError as error:
            raise InvalidResponseError("Server returned non-JSON or empty body") from error

    def get_chunk_result(
        self,
        job_id: str,
        *,
        initial_delay: float = 5.0,
        poll_interval: float = 3.0,
        max_interval: float = 15.0,
        show_progress: bool = False,
        download_dir: str | os.PathLike[str] | None = None,
    ) -> dict[str, Any]:
        """
        Poll POMA for the result of a chunking job until completion.
        Args:
            job_id (str):
                The unique identifier of the submitted job.
            initial_delay (float, default=5.0):
                Initial delay (in seconds) before the first poll request.
            poll_interval (float, default=3.0):
                Starting interval (in seconds) between polling requests. The
                interval grows by a factor of 1.5 after every poll that reports
                the job as still processing.
            max_interval (float, default=15.0):
                Maximum interval (in seconds) between polling requests.
            show_progress (bool, default=False):
                If True, prints a progress message for every poll that reports
                the job as still processing.
            download_dir (str | os.PathLike[str], optional):
                Directory to save the downloaded archive in before unpacking it.
                If None, the archive is downloaded and unpacked in memory.
        Returns:
            A dictionary with the keys `chunks` and `chunksets`.
        Raises:
            RemoteServerError: If the status endpoint answers with an HTTP error status.
            RuntimeError: For every other error raised while polling, including a
                job reported as failed, an unexpected job status, a missing
                download URL and download errors. The original exception is
                attached as `__cause__`.
        """
        time.sleep(initial_delay)
        current_interval = poll_interval
        while True:
            try:
                response = self._client.get(f"{self.base_api_url}/jobs/{job_id}/status")
                response.raise_for_status()
                data = response.json()
                status = data.get("status", "")

                if status == "done":
                    download = data.get("download", {})
                    download_url = download.get("download_url", "")
                    if not download_url:
                        raise RuntimeError("Failed to receive download URL from server.")

                    if download_dir is None:
                        # No target directory: keep the archive in memory.
                        file_bytes = self.download_bytes(download_url)
                        return self.extract_chunks_and_chunksets_from_poma_archive(
                            poma_archive_data=file_bytes
                        )

                    filename = download.get("filename", "downloaded_file.poma")
                    if filename:
                        # The name comes from the server response; keep only its last
                        # path component so it cannot point outside download_dir.
                        filename = Path(filename).name
                    downloaded_file_path = self.download_file(
                        download_url,
                        filename,
                        save_directory=download_dir,
                    )
                    return self.extract_chunks_and_chunksets_from_poma_archive(
                        poma_archive_path=downloaded_file_path
                    )

                if status == "failed":
                    error_code = data.get("code", "unknown")
                    error_details = data.get("error", "No details provided.")
                    raise RemoteServerError(
                        f"Job failed with code {error_code}: {error_details}"
                    )

                if status != "processing":
                    raise InvalidResponseError(f"Unexpected job status: {status}")

                if show_progress:
                    print(f"Job {job_id} is still processing...")
            except httpx.HTTPStatusError as error:
                raise RemoteServerError(
                    f"HTTP error: {error.response.status_code} {error.response.text}"
                ) from error
            except Exception as error:
                # NOTE(review): this also wraps the SDK errors raised above (failed
                # job, unexpected status, download errors) into RuntimeError. Kept
                # because callers may rely on catching RuntimeError here; consider
                # re-raising PomaSDKError subclasses unchanged in a future release.
                raise RuntimeError(f"POMA-AI job polling failed: {error}") from error

            # Only reached while the job is still processing: wait, then back off.
            time.sleep(current_interval)
            current_interval = min(current_interval * 1.5, max_interval)

    def extract_chunks_and_chunksets_from_poma_archive(
        self,
        poma_archive_data: bytes | None = None,
        poma_archive_path: str | os.PathLike[str] | None = None,
    ) -> dict[str, Any]:
        """
        Extract POMA archive file.
        POMA archive file is a zip file containing the chunks.json and chunksets.json files.
        Args:
            poma_archive_data (bytes, optional): The raw content of the POMA archive.
            poma_archive_path (str | os.PathLike[str], optional): Path to a POMA
                archive on disk. Takes precedence when both arguments are given.
        Returns:
            dict: A dictionary containing the chunks and chunksets.
        Raises:
            ValueError: If neither argument is provided.
            KeyError: If `chunks.json` or `chunksets.json` is missing from the
                archive or is empty.
        """
        # Sanity check for parameters
        if not poma_archive_data and not poma_archive_path:
            raise ValueError("Either poma_archive_data or poma_archive_path must be provided.")

        # ZipFile accepts both a path and a file-like object, so one code path
        # serves the on-disk and the in-memory case.
        archive_source = poma_archive_path if poma_archive_path else io.BytesIO(poma_archive_data)
        with zipfile.ZipFile(archive_source, "r") as zip_ref:
            chunks = zip_ref.read("chunks.json")
            chunksets = zip_ref.read("chunksets.json")

        # Sanity check: both members must be non-empty
        if not chunks or not chunksets:
            raise KeyError(
                "Result must contain 'chunks' and 'chunksets' keys."
            )

        return {
            "chunks": json.loads(chunks),
            "chunksets": json.loads(chunksets),
        }

    def create_cheatsheet(
        self,
        relevant_chunksets: list[dict[str, Any]],
        all_chunks: list[dict[str, Any]],
    ) -> str:
        """
        Generates a single cheatsheet for one single document
        from relevant chunksets (relevant for a certain query)
        and from all available chunks (which must contain the textual content).
        Delegates to `generate_single_cheatsheet`.
        Args:
            relevant_chunksets (list[dict]): A list of chunksets, each containing a "chunks" key with a list of chunk IDs.
            all_chunks (list[dict]): A list of all available chunk dictionaries, each representing a chunk of content.
        Returns:
            str: The textual content of the generated cheatsheet.
        """
        return generate_single_cheatsheet(relevant_chunksets, all_chunks)

    def create_cheatsheets(
        self,
        relevant_chunksets: list[dict[str, Any]],
        all_chunks: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        """
        Generates cheatsheets from relevant chunksets (relevant for a certain query)
        and from all available chunks (which must contain the textual content).
        One cheatsheet is created for each document tag found in the chunks.
        Delegates to `generate_cheatsheets`.
        Args:
            relevant_chunksets (list[dict]): A list of chunksets, each containing a "chunks" key with a list of chunk IDs.
            all_chunks (list[dict]): A list of all available chunk dictionaries, each representing a chunk of content.
        Returns:
            list[dict]: A list of dictionaries representing the generated cheatsheets, each containing:
                - 'tag': The tag associated with the respective document.
                - 'content': The textual content of the generated cheatsheet.
        """
        return generate_cheatsheets(relevant_chunksets, all_chunks)

    def download_file(
        self,
        download_url: str,
        filename: str | None = None,
        *,
        save_directory: str | os.PathLike[str] | None = None,
    ) -> Path:
        """
        Download a file from the given download URL and save it to disk.
        Args:
            download_url (str):
                The URL to download the file from.
            filename (str, optional):
                The filename to save the file as. If not provided, the last path
                component of the URL is used, falling back to "downloaded_file".
            save_directory (str | os.PathLike[str], optional):
                Directory to save the file in. If not provided, saves to current directory.
                Missing directories are created.
        Returns:
            Path: The path to the downloaded file.
        Raises:
            ValueError: If `download_url` is empty.
        """
        if not download_url:
            raise ValueError("download_url cannot be empty")

        # Determine filename
        if not filename:
            filename = Path(download_url).name or "downloaded_file"

        # Determine target path
        save_path = Path(save_directory) / filename if save_directory else Path(filename)

        # Create the directory if it doesn't exist. Path.parent is "." for a bare
        # filename, so this also works when no save_directory is given
        # (os.makedirs("") would raise FileNotFoundError).
        save_path.parent.mkdir(parents=True, exist_ok=True)

        # Download first, then write, so a failed download leaves no empty file.
        content = self.download_bytes(download_url)
        save_path.write_bytes(content)
        return save_path

    def download_bytes(
        self,
        download_url: str,
    ) -> bytes:
        """
        Download a file from the given download URL and return the bytes content.
        Args:
            download_url (str):
                The URL to download the file from. A value starting with "/" is
                treated as a path relative to the API base URL.
        Returns:
            bytes: The content of the downloaded file as bytes.
        Raises:
            ValueError: If `download_url` is empty.
            AuthenticationError: If the server answers with 401 or 403.
            RemoteServerError: If the server answers with any other error status.
            RuntimeError: For any other error raised while downloading.
        """
        import logging

        if not download_url:
            raise ValueError("download_url cannot be empty")

        # Construct the full URL if it's a relative path
        if download_url.startswith("/"):
            full_url = f"{self.base_api_url}{download_url}"
        else:
            full_url = download_url

        # Debug log instead of an unconditional print: a library should not write
        # to stdout, and the URL may not be something to show by default.
        logging.getLogger(__name__).debug("Downloading file from: %s", full_url)

        # NOTE(review): the request goes through the shared client, so its
        # Authorization header is sent to whatever host `full_url` names —
        # confirm the server only hands out URLs that should receive the token.
        try:
            response = self._client.get(full_url)
            response.raise_for_status()
            return response.content
        except httpx.HTTPStatusError as error:
            status = error.response.status_code
            body = error.response.text
            if status in (401, 403):
                raise AuthenticationError(
                    f"Authentication failed when downloading file: {body}"
                ) from error
            raise RemoteServerError(f"Failed to download file: {status} {body}") from error
        except Exception as error:
            raise RuntimeError(f"File download failed: {error}") from error

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def __enter__(self) -> "Poma":
        """Return the client itself for use in a ``with`` statement."""
        return self

    def __exit__(self, *exc) -> None:
        """Close the HTTP client when the ``with`` block ends; exceptions are not suppressed."""
        self.close()
|
poma/exceptions.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# exceptions.py
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class PomaSDKError(Exception):
    """Common base class of every exception defined by the POMA SDK."""


class AuthenticationError(PomaSDKError):
    """Raised when the server rejects the credentials (HTTP 401 or 403)."""


class RemoteServerError(PomaSDKError):
    """Signals a failure reported by the POMA backend, such as an HTTP error status or a job that ended as ``failed``."""


class InvalidInputError(PomaSDKError):
    """Raised when the caller supplies unsupported input, e.g. a file whose extension is not allowed."""


class InvalidResponseError(PomaSDKError):
    """Raised when the server's response cannot be used, e.g. a non-JSON or empty body or an unexpected job status."""
|
|
poma/integrations/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Re-exports the integration classes of the two sibling modules so they can be
# imported directly from this package.
# NOTE(review): both modules are imported unconditionally, so importing this
# package presumably requires both LangChain and LlamaIndex to be installed —
# confirm whether that is intended.
from .langchain_poma import (
    PomaFileLoader,
    PomaChunksetSplitter,
    PomaCheatsheetRetrieverLC,
)

from .llamaindex_poma import (
    PomaFileReader,
    PomaChunksetNodeParser,
    PomaCheatsheetRetrieverLI,
)

# Names exposed by `from <package> import *`.
__all__ = [
    "PomaFileLoader",
    "PomaChunksetSplitter",
    "PomaCheatsheetRetrieverLC",
    "PomaFileReader",
    "PomaChunksetNodeParser",
    "PomaCheatsheetRetrieverLI",
]
|