poma 0.2.2__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- poma/client.py +90 -47
- poma/integrations/__init__.py +0 -20
- poma/integrations/langchain/__init__.py +12 -0
- poma/integrations/llamaindex/__init__.py +11 -0
- poma/integrations/qdrant/__init__.py +23 -0
- poma/integrations/qdrant/qdrant_poma.py +1326 -0
- {poma-0.2.2.dist-info → poma-0.3.2.dist-info}/METADATA +28 -20
- poma-0.3.2.dist-info/RECORD +16 -0
- {poma-0.2.2.dist-info → poma-0.3.2.dist-info}/WHEEL +1 -1
- poma-0.2.2.dist-info/RECORD +0 -12
- /poma/integrations/{langchain_poma.py → langchain/langchain_poma.py} +0 -0
- /poma/integrations/{llamaindex_poma.py → llamaindex/llamaindex_poma.py} +0 -0
- {poma-0.2.2.dist-info → poma-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {poma-0.2.2.dist-info → poma-0.3.2.dist-info}/top_level.txt +0 -0
poma/client.py
CHANGED
|
@@ -21,6 +21,38 @@ USER_AGENT = "poma-ai-sdk/0.1.0"
|
|
|
21
21
|
API_BASE_URL = "https://api.poma-ai.com/api/v1"
|
|
22
22
|
|
|
23
23
|
|
|
24
|
+
def extract_chunks_and_chunksets_from_poma_archive(
|
|
25
|
+
poma_archive_data: bytes | None = None,
|
|
26
|
+
poma_archive_path: str | os.PathLike[str] | None = None,
|
|
27
|
+
) -> dict[str, Any]:
|
|
28
|
+
"""
|
|
29
|
+
Extract chunks and chunksets from a POMA archive file.
|
|
30
|
+
POMA archive is a zip file containing chunks.json and chunksets.json.
|
|
31
|
+
Args:
|
|
32
|
+
poma_archive_data: The POMA archive as bytes.
|
|
33
|
+
poma_archive_path: Path to the POMA archive file.
|
|
34
|
+
Returns:
|
|
35
|
+
dict: A dictionary with ``chunks`` and ``chunksets`` keys.
|
|
36
|
+
"""
|
|
37
|
+
chunks = None
|
|
38
|
+
chunksets = None
|
|
39
|
+
if poma_archive_path:
|
|
40
|
+
with zipfile.ZipFile(poma_archive_path, "r") as zip_ref:
|
|
41
|
+
chunks = zip_ref.read("chunks.json")
|
|
42
|
+
chunksets = zip_ref.read("chunksets.json")
|
|
43
|
+
elif poma_archive_data:
|
|
44
|
+
with zipfile.ZipFile(io.BytesIO(poma_archive_data), "r") as zip_ref:
|
|
45
|
+
chunks = zip_ref.read("chunks.json")
|
|
46
|
+
chunksets = zip_ref.read("chunksets.json")
|
|
47
|
+
else:
|
|
48
|
+
raise ValueError(
|
|
49
|
+
"Either poma_archive_data or poma_archive_path must be provided."
|
|
50
|
+
)
|
|
51
|
+
if not chunks or not chunksets:
|
|
52
|
+
raise KeyError("Result must contain 'chunks' and 'chunksets' keys.")
|
|
53
|
+
return {"chunks": json.loads(chunks), "chunksets": json.loads(chunksets)}
|
|
54
|
+
|
|
55
|
+
|
|
24
56
|
class Poma:
|
|
25
57
|
"""
|
|
26
58
|
Client for interacting with the POMA API.
|
|
@@ -61,23 +93,25 @@ class Poma:
|
|
|
61
93
|
|
|
62
94
|
def start_chunk_file(
|
|
63
95
|
self,
|
|
64
|
-
file_path: os.PathLike[str],
|
|
96
|
+
file_path: str | os.PathLike[str],
|
|
65
97
|
*,
|
|
66
98
|
base_url: str | None = None,
|
|
67
99
|
) -> dict[str, Any]:
|
|
68
100
|
"""
|
|
69
101
|
Submit a file with text to POMA for chunking.
|
|
70
102
|
Args:
|
|
71
|
-
file_path (os.PathLike[str]):
|
|
72
|
-
Path to the input file. Must have an allowed file extension.
|
|
103
|
+
file_path (str | os.PathLike[str]):
|
|
104
|
+
Path to the input file (string or path-like). Must have an allowed file extension.
|
|
73
105
|
base_url (str, optional):
|
|
74
106
|
Optional base URL to resolve relative links within the file.
|
|
75
107
|
Returns:
|
|
76
108
|
A dictionary containing a unique job identifier for the submitted job.
|
|
77
109
|
"""
|
|
78
|
-
if
|
|
79
|
-
raise ValueError("file_path must be a non-empty
|
|
110
|
+
if file_path is None or (isinstance(file_path, str) and not file_path.strip()):
|
|
111
|
+
raise ValueError("file_path must be a non-empty string or path-like.")
|
|
112
|
+
path = Path(file_path)
|
|
80
113
|
payload = {}
|
|
114
|
+
payload["is_sdk"] = True
|
|
81
115
|
if base_url:
|
|
82
116
|
payload["base_url"] = base_url
|
|
83
117
|
try:
|
|
@@ -85,7 +119,7 @@ class Poma:
|
|
|
85
119
|
f"{self.base_api_url}/ingest",
|
|
86
120
|
data=payload,
|
|
87
121
|
files={
|
|
88
|
-
"file": (
|
|
122
|
+
"file": (path.name, path.read_bytes()),
|
|
89
123
|
},
|
|
90
124
|
)
|
|
91
125
|
response.raise_for_status()
|
|
@@ -93,10 +127,10 @@ class Poma:
|
|
|
93
127
|
status = error.response.status_code
|
|
94
128
|
if status in (401, 403):
|
|
95
129
|
raise AuthenticationError(
|
|
96
|
-
f"Failed to submit file '{
|
|
130
|
+
f"Failed to submit file '{path}': authentication error"
|
|
97
131
|
) from error
|
|
98
132
|
raise RemoteServerError(
|
|
99
|
-
f"Failed to submit file '{
|
|
133
|
+
f"Failed to submit file '{path}': {status}"
|
|
100
134
|
) from error
|
|
101
135
|
try:
|
|
102
136
|
data = response.json()
|
|
@@ -115,6 +149,7 @@ class Poma:
|
|
|
115
149
|
max_interval: float = 15.0,
|
|
116
150
|
show_progress: bool = False,
|
|
117
151
|
download_dir: str | os.PathLike[str] | None = None,
|
|
152
|
+
filename: str | None = None,
|
|
118
153
|
) -> dict[str, Any]:
|
|
119
154
|
"""
|
|
120
155
|
Poll POMA for the result of a chunking job until completion.
|
|
@@ -130,15 +165,22 @@ class Poma:
|
|
|
130
165
|
show_progress (bool, default=False):
|
|
131
166
|
If True, logs progress messages during polling.
|
|
132
167
|
download_dir (str | os.PathLike[str], optional):
|
|
133
|
-
Directory to save the downloaded file in.
|
|
134
|
-
|
|
135
|
-
|
|
168
|
+
Directory to save the downloaded file in. If neither download_dir nor
|
|
169
|
+
filename is set, the result is returned in memory (no file saved). If
|
|
170
|
+
filename is set but download_dir is not, the file is saved in the
|
|
171
|
+
current directory.
|
|
172
|
+
filename (str, optional):
|
|
173
|
+
Name for the saved .poma file. If it does not end with ``.poma``, that
|
|
174
|
+
suffix is appended. If not set when saving to disk, uses the server
|
|
175
|
+
filename when provided, otherwise ``{job_id}.poma``.
|
|
136
176
|
Returns:
|
|
137
177
|
The JSON result containing at least the keys `chunks` and `chunksets`.
|
|
138
178
|
|
|
139
179
|
"""
|
|
140
180
|
time.sleep(initial_delay)
|
|
141
181
|
current_interval = poll_interval
|
|
182
|
+
last_status = None
|
|
183
|
+
|
|
142
184
|
while True:
|
|
143
185
|
time.sleep(current_interval)
|
|
144
186
|
try:
|
|
@@ -154,17 +196,26 @@ class Poma:
|
|
|
154
196
|
"Failed to receive download URL from server."
|
|
155
197
|
)
|
|
156
198
|
|
|
157
|
-
if download_dir is None:
|
|
199
|
+
if download_dir is None and filename is None:
|
|
158
200
|
# Return bytes content instead of saving to file
|
|
159
201
|
file_bytes = self.download_bytes(download_url)
|
|
160
202
|
return self.extract_chunks_and_chunksets_from_poma_archive(
|
|
161
203
|
poma_archive_data=file_bytes
|
|
162
204
|
)
|
|
163
205
|
else:
|
|
164
|
-
# Save downloaded file to
|
|
165
|
-
|
|
206
|
+
# Save downloaded file (to download_dir or current dir if only filename set)
|
|
207
|
+
save_filename = (
|
|
208
|
+
filename or download.get("filename") or f"{job_id}.poma"
|
|
209
|
+
)
|
|
210
|
+
if not save_filename.endswith(".poma"):
|
|
211
|
+
save_filename = f"{save_filename}.poma"
|
|
212
|
+
save_dir = (
|
|
213
|
+
download_dir
|
|
214
|
+
if download_dir not in (None, "")
|
|
215
|
+
else "."
|
|
216
|
+
)
|
|
166
217
|
downloaded_file_path = self.download_file(
|
|
167
|
-
download_url,
|
|
218
|
+
download_url, save_filename, save_directory=save_dir
|
|
168
219
|
)
|
|
169
220
|
return self.extract_chunks_and_chunksets_from_poma_archive(
|
|
170
221
|
poma_archive_path=downloaded_file_path
|
|
@@ -181,9 +232,18 @@ class Poma:
|
|
|
181
232
|
elif status == "processing":
|
|
182
233
|
if show_progress:
|
|
183
234
|
print(f"Job {job_id} is still processing...")
|
|
235
|
+
if last_status == "pending":
|
|
236
|
+
current_interval = poll_interval
|
|
237
|
+
current_interval = min(current_interval * 1.5, max_interval)
|
|
238
|
+
elif status == "pending":
|
|
239
|
+
if show_progress:
|
|
240
|
+
print(
|
|
241
|
+
f"Job {job_id} is pending (queued due to rate limiting, sequential processing - common on demo accounts)..."
|
|
242
|
+
)
|
|
184
243
|
current_interval = min(current_interval * 1.5, max_interval)
|
|
185
244
|
else:
|
|
186
245
|
raise InvalidResponseError(f"Unexpected job status: {status}")
|
|
246
|
+
last_status = status
|
|
187
247
|
except httpx.HTTPStatusError as error:
|
|
188
248
|
raise RemoteServerError(
|
|
189
249
|
f"HTTP error: {error.response.status_code} {error.response.text}"
|
|
@@ -197,37 +257,18 @@ class Poma:
|
|
|
197
257
|
poma_archive_path: str | os.PathLike[str] | None = None,
|
|
198
258
|
) -> dict[str, Any]:
|
|
199
259
|
"""
|
|
200
|
-
Extract POMA archive
|
|
201
|
-
POMA archive
|
|
260
|
+
Extract chunks and chunksets from a POMA archive; delegates to module-level function.
|
|
261
|
+
POMA archive is a zip file containing chunks.json and chunksets.json.
|
|
202
262
|
Args:
|
|
203
|
-
|
|
263
|
+
poma_archive_data: The POMA archive as bytes.
|
|
264
|
+
poma_archive_path: Path to the POMA archive file.
|
|
204
265
|
Returns:
|
|
205
|
-
dict: A dictionary
|
|
266
|
+
dict: A dictionary with ``chunks`` and ``chunksets`` keys.
|
|
206
267
|
"""
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
if poma_archive_path:
|
|
212
|
-
with zipfile.ZipFile(poma_archive_path, "r") as zip_ref:
|
|
213
|
-
chunks = zip_ref.read("chunks.json")
|
|
214
|
-
chunksets = zip_ref.read("chunksets.json")
|
|
215
|
-
elif poma_archive_data:
|
|
216
|
-
with zipfile.ZipFile(io.BytesIO(poma_archive_data), "r") as zip_ref:
|
|
217
|
-
chunks = zip_ref.read("chunks.json")
|
|
218
|
-
chunksets = zip_ref.read("chunksets.json")
|
|
219
|
-
else:
|
|
220
|
-
raise ValueError(
|
|
221
|
-
"Either poma_archive_data or poma_archive_path must be provided."
|
|
222
|
-
)
|
|
223
|
-
|
|
224
|
-
# Sanity check
|
|
225
|
-
if not chunks or not chunksets:
|
|
226
|
-
raise KeyError("Result must contain 'chunks' and 'chunksets' keys.")
|
|
227
|
-
|
|
228
|
-
# Load the chunks and chunksets
|
|
229
|
-
json_result = {"chunks": json.loads(chunks), "chunksets": json.loads(chunksets)}
|
|
230
|
-
return json_result
|
|
268
|
+
return extract_chunks_and_chunksets_from_poma_archive(
|
|
269
|
+
poma_archive_data=poma_archive_data,
|
|
270
|
+
poma_archive_path=poma_archive_path,
|
|
271
|
+
)
|
|
231
272
|
|
|
232
273
|
def create_cheatsheet(
|
|
233
274
|
self,
|
|
@@ -301,14 +342,16 @@ class Poma:
|
|
|
301
342
|
if not filename:
|
|
302
343
|
filename = Path(download_url).name or "downloaded_file"
|
|
303
344
|
|
|
304
|
-
# Determine save directory
|
|
345
|
+
# Determine save directory (default current directory so path has a parent)
|
|
305
346
|
if save_directory:
|
|
306
347
|
save_path = Path(save_directory) / filename
|
|
307
348
|
else:
|
|
308
|
-
save_path = Path(filename
|
|
349
|
+
save_path = Path(".") / filename
|
|
309
350
|
|
|
310
|
-
# Create the directory if it doesn't exist
|
|
311
|
-
os.
|
|
351
|
+
# Create the directory if it doesn't exist (skip when file is in cwd)
|
|
352
|
+
parent = os.path.dirname(save_path)
|
|
353
|
+
if parent:
|
|
354
|
+
os.makedirs(parent, exist_ok=True)
|
|
312
355
|
|
|
313
356
|
# Download the file data
|
|
314
357
|
content = self.download_bytes(download_url)
|
poma/integrations/__init__.py
CHANGED
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
from .langchain_poma import (
|
|
2
|
-
PomaFileLoader,
|
|
3
|
-
PomaChunksetSplitter,
|
|
4
|
-
PomaCheatsheetRetrieverLC,
|
|
5
|
-
)
|
|
6
|
-
|
|
7
|
-
from .llamaindex_poma import (
|
|
8
|
-
PomaFileReader,
|
|
9
|
-
PomaChunksetNodeParser,
|
|
10
|
-
PomaCheatsheetRetrieverLI,
|
|
11
|
-
)
|
|
12
|
-
|
|
13
|
-
__all__ = [
|
|
14
|
-
"PomaFileLoader",
|
|
15
|
-
"PomaChunksetSplitter",
|
|
16
|
-
"PomaCheatsheetRetrieverLC",
|
|
17
|
-
"PomaFileReader",
|
|
18
|
-
"PomaChunksetNodeParser",
|
|
19
|
-
"PomaCheatsheetRetrieverLI",
|
|
20
|
-
]
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from .qdrant_poma import (
|
|
2
|
+
PomaQdrant,
|
|
3
|
+
QdrantConfig,
|
|
4
|
+
VectorConfig,
|
|
5
|
+
InferenceConfig,
|
|
6
|
+
QdrantResponseError,
|
|
7
|
+
SearchResult,
|
|
8
|
+
chunk_uuid_string,
|
|
9
|
+
DenseEmbedSync,
|
|
10
|
+
SparseEmbedSync,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"PomaQdrant",
|
|
15
|
+
"QdrantConfig",
|
|
16
|
+
"VectorConfig",
|
|
17
|
+
"InferenceConfig",
|
|
18
|
+
"QdrantResponseError",
|
|
19
|
+
"SearchResult",
|
|
20
|
+
"chunk_uuid_string",
|
|
21
|
+
"DenseEmbedSync",
|
|
22
|
+
"SparseEmbedSync",
|
|
23
|
+
]
|