poma 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
poma/__init__.py ADDED
@@ -0,0 +1,15 @@
1
# Public API of the ``poma`` package: the client plus every SDK exception
# type the client can raise, so callers can catch them via ``from poma import ...``.
from .client import Poma
from .exceptions import (
    PomaSDKError,
    AuthenticationError,
    RemoteServerError,
    InvalidInputError,
    InvalidResponseError,
)

__all__ = [
    "Poma",
    "PomaSDKError",
    "AuthenticationError",
    "RemoteServerError",
    "InvalidInputError",
    # Raised by the client for non-JSON bodies and unexpected job statuses;
    # exported so it is reachable from the package root like its siblings.
    "InvalidResponseError",
]
poma/client.py ADDED
@@ -0,0 +1,353 @@
1
+ # client.py
2
+ import os
3
+ import httpx
4
+ import zipfile
5
+ import io
6
+ import json
7
+ import time
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from poma.exceptions import AuthenticationError, InvalidInputError, RemoteServerError, InvalidResponseError
12
+ from poma.retrieval import generate_cheatsheets, generate_single_cheatsheet
13
+
14
# Default endpoint of the POMA API; Poma.__init__ lets the API_BASE_URL
# environment variable override it.
API_BASE_URL = "https://api.poma-ai.com/api/v1"

# Value sent in the "user-agent" header of the default HTTP client.
USER_AGENT = "poma-ai-sdk/0.1.0"

# Lower-case file suffixes (including the leading dot) accepted for upload.
ALLOWED_FILE_EXTENSIONS: set[str] = {".htm", ".html", ".md", ".pdf", ".txt"}
25
+
26
+
27
+ class Poma:
28
+ """
29
+ Client for interacting with the POMA API.
30
+ """
31
+
32
+ def __init__(
33
+ self,
34
+ api_key: str | None = None,
35
+ *,
36
+ timeout: float = 600.0,
37
+ client: httpx.Client | None = None,
38
+ ):
39
+ """
40
+ Initialize the POMA client.
41
+ Args:
42
+ api_key (str, optional):
43
+ API key for authenticating with POMA. If not provided,
44
+ the value is read from the environment variable `POMA_API_KEY`.
45
+ timeout (float, default=600.0):
46
+ Timeout (in seconds) for all HTTP requests.
47
+ client (httpx.Client, optional):
48
+ Custom HTTP client. If not provided, a default client is created.
49
+ """
50
+ api_base_url = API_BASE_URL
51
+ # Override API base URL if environment variable is set
52
+ if os.environ.get("API_BASE_URL"):
53
+ api_base_url = os.environ.get("API_BASE_URL")
54
+
55
+ self.base_api_url = api_base_url.rstrip("/")
56
+ self._client = client or httpx.Client(timeout=timeout,
57
+ headers={"user-agent": USER_AGENT})
58
+ if not (api_key := api_key or os.environ.get("POMA_API_KEY", "")):
59
+ raise Exception("POMA_API_KEY environment variable not set.")
60
+ self._client.headers.update({"Authorization": f"Bearer {api_key}"})
61
+
62
+ def start_chunk_file(
63
+ self,
64
+ file_path: os.PathLike[str],
65
+ *,
66
+ base_url: str | None = None,
67
+ ) -> dict[str, Any]:
68
+ """
69
+ Submit a file with text to POMA for chunking.
70
+ Args:
71
+ file_path (os.PathLike[str]):
72
+ Path to the input file. Must have an allowed file extension.
73
+ base_url (str, optional):
74
+ Optional base URL to resolve relative links within the file.
75
+ Returns:
76
+ A dictionary containing a unique job identifier for the submitted job.
77
+ """
78
+ if not file_path or not isinstance(file_path, os.PathLike):
79
+ raise ValueError("file_path must be a non-empty os.PathLike.")
80
+ file_extension = Path(file_path).suffix.lower()
81
+ if file_extension not in ALLOWED_FILE_EXTENSIONS:
82
+ raise InvalidInputError(
83
+ f"File extension of {file_path} is not allowed; use one of the following types: {', '.join(sorted(ALLOWED_FILE_EXTENSIONS))}."
84
+ )
85
+ payload = {}
86
+ if base_url:
87
+ payload["base_url"] = base_url
88
+ try:
89
+ response = self._client.post(
90
+ f"{self.base_api_url}/process",
91
+ data=payload,
92
+ files={
93
+ "file": (Path(file_path).name, Path(file_path).read_bytes()),
94
+ },
95
+ )
96
+ response.raise_for_status()
97
+ except httpx.HTTPStatusError as error:
98
+ status = error.response.status_code
99
+ if status in (401, 403):
100
+ raise AuthenticationError(response.text) from error
101
+ raise RemoteServerError(f"{status}: {response.text}") from error
102
+ try:
103
+ data = response.json()
104
+ except ValueError as error:
105
+ raise InvalidResponseError("Server returned non-JSON or empty body") from error
106
+ return data
107
+
108
+ def get_chunk_result(
109
+ self,
110
+ job_id: str,
111
+ *,
112
+ initial_delay: float = 5.0,
113
+ poll_interval: float = 3.0,
114
+ max_interval: float = 15.0,
115
+ show_progress: bool = False,
116
+ download_dir: str | os.PathLike[str] | None = None,
117
+ ) -> dict[str, Any]:
118
+ """
119
+ Poll POMA for the result of a chunking job until completion.
120
+ Args:
121
+ job_id (str):
122
+ The unique identifier of the submitted job.
123
+ initial_delay (float, default=5.0):
124
+ Initial delay (in seconds) before the first poll request.
125
+ poll_interval (float, default=1.0):
126
+ Starting interval (in seconds) between polling requests.
127
+ max_interval (float, default=15.0):
128
+ Maximum interval (in seconds) between polling requests.
129
+ show_progress (bool, default=False):
130
+ If True, logs progress messages during polling.
131
+ download_dir (str | os.PathLike[str], optional):
132
+ Directory to save the downloaded file in. Required if return_bytes=False.
133
+ return_bytes (bool, default=False):
134
+ If True, returns the file content as bytes instead of saving to disk.
135
+ Returns:
136
+ The JSON result containing at least the keys `chunks` and `chunksets`.
137
+
138
+ """
139
+ time.sleep(initial_delay)
140
+ current_interval = poll_interval
141
+ while True:
142
+ time.sleep(current_interval)
143
+ try:
144
+ response = self._client.get(f"{self.base_api_url}/jobs/{job_id}/status")
145
+ response.raise_for_status()
146
+ data = response.json()
147
+ status = data.get("status", "")
148
+ if status == "done":
149
+ download = data.get("download", {})
150
+ download_url = download.get("download_url", "")
151
+ if not download_url:
152
+ raise RuntimeError("Failed to receive download URL from server.")
153
+
154
+ if download_dir is None:
155
+ # Return bytes content instead of saving to file
156
+ file_bytes = self.download_bytes(download_url)
157
+ return self.extract_chunks_and_chunksets_from_poma_archive(poma_archive_data=file_bytes)
158
+ else:
159
+ # Save downloaded file to directory
160
+ filename = download.get("filename", "downloaded_file.poma")
161
+ downloaded_file_path = self.download_file(download_url,
162
+ filename,
163
+ save_directory=download_dir)
164
+ return self.extract_chunks_and_chunksets_from_poma_archive(poma_archive_path=downloaded_file_path)
165
+ elif status == "failed":
166
+ error_code = data.get("code", "unknown")
167
+ error_details = data.get("error", "No details provided.")
168
+ error_message = (
169
+ f"Job failed with code {error_code}: {error_details}"
170
+ )
171
+ raise RemoteServerError(
172
+ f"Job failed: {data.get('error', error_message)}"
173
+ )
174
+ elif status == "processing":
175
+ if show_progress:
176
+ print(f"Job {job_id} is still processing...")
177
+ current_interval = min(current_interval * 1.5, max_interval)
178
+ else:
179
+ raise InvalidResponseError(f"Unexpected job status: {status}")
180
+ except httpx.HTTPStatusError as error:
181
+ raise RemoteServerError(
182
+ f"HTTP error: {error.response.status_code} {error.response.text}"
183
+ ) from error
184
+ except Exception as error:
185
+ raise RuntimeError(f"POMA-AI job polling failed: {error}") from error
186
+
187
+ def extract_chunks_and_chunksets_from_poma_archive(
188
+ self,
189
+ poma_archive_data: bytes | None = None,
190
+ poma_archive_path: str | os.PathLike[str] | None = None,
191
+ ) -> dict[str, Any]:
192
+ """
193
+ Extract POMA archive file.
194
+ POMA archive file is a zip file containing the chunks.json and chunksets.json files.
195
+ Args:
196
+ poma_archive (bytes): The POMA archive file.
197
+ Returns:
198
+ dict: A dictionary containing the chunks and chunksets.
199
+ """
200
+ # Sanity check for parameters
201
+ if not poma_archive_data and not poma_archive_path:
202
+ raise ValueError("Either poma_archive_data or poma_archive_path must be provided.")
203
+
204
+ # Load the chunks and chunksets from POMA archive
205
+ chunks = None
206
+ chunksets = None
207
+ if poma_archive_path:
208
+ with zipfile.ZipFile(poma_archive_path, "r") as zip_ref:
209
+ chunks = zip_ref.read('chunks.json')
210
+ chunksets = zip_ref.read('chunksets.json')
211
+ else:
212
+ with zipfile.ZipFile(io.BytesIO(poma_archive_data), "r") as zip_ref:
213
+ chunks = zip_ref.read('chunks.json')
214
+ chunksets = zip_ref.read('chunksets.json')
215
+
216
+ # Sanity check
217
+ if not chunks or not chunksets:
218
+ raise KeyError(
219
+ "Result must contain 'chunks' and 'chunksets' keys."
220
+ )
221
+
222
+ # Load the chunks and chunksets
223
+ json_result = {
224
+ "chunks": json.loads(chunks),
225
+ "chunksets": json.loads(chunksets)
226
+ }
227
+ return json_result
228
+
229
+ def create_cheatsheet(
230
+ self,
231
+ relevant_chunksets: list[dict[str, Any]],
232
+ all_chunks: list[dict[str, Any]],
233
+ ) -> str:
234
+ """
235
+ Generates a single cheatsheet for one single document
236
+ from relevant chunksets (relevant for a certain query)
237
+ and from all available chunks (which must contain the textual content).
238
+ Args:
239
+ relevant_chunksets (list[dict]): A list of chunksets, each containing a "chunks" key with a list of chunk IDs.
240
+ all_chunks (list[dict]): A list of all available chunk dictionaries, each representing a chunk of content.
241
+ Returns:
242
+ str: The textual content of the generated cheatsheet.
243
+ """
244
+ return generate_single_cheatsheet(relevant_chunksets, all_chunks)
245
+
246
+ def create_cheatsheets(
247
+ self,
248
+ relevant_chunksets: list[dict[str, Any]],
249
+ all_chunks: list[dict[str, Any]],
250
+ ) -> list[dict[str, Any]]:
251
+ """
252
+ Generates cheatsheets from relevant chunksets (relevant for a certain query)
253
+ and from all available chunks (which must contain the textual content).
254
+ One cheatsheet is created for each document tag found in the chunks.
255
+ Args:
256
+ relevant_chunksets (list[dict]): A list of chunksets, each containing a "chunks" key with a list of chunk IDs.
257
+ all_chunks (list[dict]): A list of all available chunk dictionaries, each representing a chunk of content.
258
+ Returns:
259
+ list[dict]: A list of dictionaries representing the generated cheatsheets, each containing:
260
+ - 'tag': The tag associated with the respective document.
261
+ - 'content': The textual content of the generated cheatsheet.
262
+ """
263
+ return generate_cheatsheets(relevant_chunksets, all_chunks)
264
+
265
+ def download_file(
266
+ self,
267
+ download_url: str,
268
+ filename: str | None = None,
269
+ *,
270
+ save_directory: str | os.PathLike[str] | None = None,
271
+ ) -> str:
272
+ """
273
+ Download a file from the given download URL.
274
+ Args:
275
+ download_url (str):
276
+ The URL to download the file from.
277
+ filename (str, optional):
278
+ The filename to save the file as. If not provided, will be extracted from URL.
279
+ save_directory (str | os.PathLike[str], optional):
280
+ Directory to save the file in. If not provided, saves to current directory.
281
+ Returns:
282
+ str: The path to the downloaded file.
283
+ """
284
+ if not download_url:
285
+ raise ValueError("download_url cannot be empty")
286
+
287
+ # Determine filename
288
+ if not filename:
289
+ filename = Path(download_url).name or "downloaded_file"
290
+
291
+ # Determine save directory
292
+ if save_directory:
293
+ save_path = Path(save_directory) / filename
294
+ else:
295
+ save_path = Path(filename)
296
+
297
+ # Create the directory if it doesn't exist
298
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
299
+
300
+ # Download the file data
301
+ content = self.download_bytes(download_url)
302
+
303
+ # Save the file
304
+ with open(save_path, "wb") as f:
305
+ f.write(content)
306
+ return save_path
307
+
308
+ def download_bytes(
309
+ self,
310
+ download_url: str,
311
+ ) -> bytes:
312
+ """
313
+ Download a file from the given download URL and return the bytes content.
314
+ Args:
315
+ download_url (str):
316
+ The URL to download the file from.
317
+ Returns:
318
+ bytes: The content of the downloaded file as bytes.
319
+ """
320
+ if not download_url:
321
+ raise ValueError("download_url cannot be empty")
322
+
323
+ # Construct the full URL if it's a relative path
324
+ if download_url.startswith("/"):
325
+ full_url = f"{self.base_api_url}{download_url}"
326
+ else:
327
+ full_url = download_url
328
+
329
+ print("Downloading file from:", full_url)
330
+ try:
331
+ # Download the file
332
+ response = self._client.get(full_url)
333
+ response.raise_for_status()
334
+
335
+ # Return the bytes content
336
+ return response.content
337
+
338
+ except httpx.HTTPStatusError as error:
339
+ status = error.response.status_code
340
+ if status in (401, 403):
341
+ raise AuthenticationError(f"Authentication failed when downloading file: {response.text}") from error
342
+ raise RemoteServerError(f"Failed to download file: {status} {response.text}") from error
343
+ except Exception as error:
344
+ raise RuntimeError(f"File download failed: {error}") from error
345
+
346
+ def close(self):
347
+ self._client.close()
348
+
349
+ def __enter__(self) -> "Poma":
350
+ return self
351
+
352
+ def __exit__(self, *exc):
353
+ self.close()
poma/exceptions.py ADDED
@@ -0,0 +1,20 @@
1
+ # exceptions.py
2
+
3
+
4
class PomaSDKError(Exception):
    """Root of the SDK's exception hierarchy; every custom SDK error derives from it."""
6
+
7
+
8
class AuthenticationError(PomaSDKError):
    """Raised for 401/403 responses, i.e. when the token is invalid or missing."""
10
+
11
+
12
class RemoteServerError(PomaSDKError):
    """Raised when the Poma backend reports a failure.

    The client raises it for any HTTP error status other than 401/403
    (not only 5xx) and when a chunking job ends with the status "failed".
    """
14
+
15
+
16
class InvalidInputError(PomaSDKError):
    """Raised when the input is rejected before any request is sent.

    The client raises it from ``start_chunk_file`` when the file extension
    is not one of the allowed file types.
    """
18
+
19
class InvalidResponseError(PomaSDKError):
    """Raised when a server response cannot be interpreted.

    The client raises it for a non-JSON or empty response body and for a
    job status other than "done", "failed" or "processing".
    """
@@ -0,0 +1,20 @@
1
# Re-exports the framework integrations from a single import location.
# NOTE(review): both integration modules are imported eagerly, so importing
# this package presumably requires LangChain and LlamaIndex to be installed —
# confirm whether they are meant to be optional extras.

# LangChain integration
from .langchain_poma import PomaFileLoader, PomaChunksetSplitter, PomaCheatsheetRetrieverLC

# LlamaIndex integration
from .llamaindex_poma import PomaFileReader, PomaChunksetNodeParser, PomaCheatsheetRetrieverLI

__all__ = [
    # LangChain
    "PomaFileLoader",
    "PomaChunksetSplitter",
    "PomaCheatsheetRetrieverLC",
    # LlamaIndex
    "PomaFileReader",
    "PomaChunksetNodeParser",
    "PomaCheatsheetRetrieverLI",
]