poma-0.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- poma/__init__.py +15 -0
- poma/client.py +353 -0
- poma/exceptions.py +20 -0
- poma/integrations/__init__.py +20 -0
- poma/integrations/langchain_poma.py +358 -0
- poma/integrations/llamaindex_poma.py +361 -0
- poma/retrieval.py +176 -0
- poma-0.0.0.dist-info/METADATA +66 -0
- poma-0.0.0.dist-info/RECORD +12 -0
- poma-0.0.0.dist-info/WHEEL +5 -0
- poma-0.0.0.dist-info/licenses/LICENSE +177 -0
- poma-0.0.0.dist-info/top_level.txt +1 -0
poma/integrations/llamaindex_poma.py
ADDED
@@ -0,0 +1,361 @@
# ---------------------------------------------------------------------
# POMA integration for LlamaIndex
# ---------------------------------------------------------------------

import os
import hashlib
from typing import Any
from pathlib import Path
from collections import defaultdict
from collections.abc import Sequence, Iterable
from concurrent.futures import ThreadPoolExecutor, as_completed

from llama_index.core.readers.base import BaseReader
from llama_index.core.node_parser import NodeParser
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import (
    Document,
    NodeWithScore,
    TextNode,
    BaseNode,
    QueryBundle,
)
from pydantic import PrivateAttr

from poma import Poma
from poma.client import ALLOWED_FILE_EXTENSIONS
from poma.retrieval import _cheatsheets_from_chunks
from poma.exceptions import InvalidInputError

__all__ = ["PomaFileReader", "PomaChunksetNodeParser", "PomaCheatsheetRetrieverLI"]


# ------------------------------------------------------------------ #
# Load from Path → LI Documents                                       #
# ------------------------------------------------------------------ #


class PomaFileReader(BaseReader):

    def load_data(self, input_path: str | Path) -> list[Document]:
        """
        Load files from the input path (file or directory) into LlamaIndex Documents.
        Only files with allowed extensions are processed; others are skipped.
        """
        path = Path(input_path).expanduser().resolve()
        if not path.exists():
            raise FileNotFoundError(f"No such path: {path}")

        documents: list[Document] = []
        skipped: int = 0

        def _process_file(file_path: Path):
            nonlocal skipped, documents
            if not file_path.is_file():
                return
            file_extension = file_path.suffix.lower()
            if not file_extension or file_extension not in ALLOWED_FILE_EXTENSIONS:
                skipped += 1
                return
            file_bytes = file_path.read_bytes()
            file_hash = hashlib.md5(file_bytes).hexdigest()
            if file_extension == ".pdf":
                # LlamaIndex requires `text` to be str.
                # Actual file processing happens downstream in the node parser.
                text_payload: str = ""
            else:
                try:
                    text_payload = file_bytes.decode("utf-8")
                except UnicodeDecodeError:
                    skipped += 1
                    return
            documents.append(
                Document(
                    text=text_payload,
                    metadata={
                        "source_path": str(file_path),
                        "doc_id": f"{file_hash}",
                    },
                )
            )

        if path.is_file():
            _process_file(path)
        elif path.is_dir():
            for path_in_dir in sorted(path.rglob("*")):
                _process_file(path_in_dir)
        else:
            raise FileNotFoundError(f"Unsupported path type (not file/dir): {path}")

        allowed = ", ".join(sorted(ALLOWED_FILE_EXTENSIONS))
        if not documents:
            raise InvalidInputError(f"No supported files found. Allowed: {allowed}")
        if skipped > 0:
            print(
                f"Skipped {skipped} file(s) due to unsupported or unreadable type. Allowed: {allowed}"
            )
        return documents


# ------------------------------------------------------------------ #
# Generate Chunksets                                                  #
# ------------------------------------------------------------------ #


class PomaChunksetNodeParser(NodeParser):
    # """Calls **POMA API** for each document, choosing text vs file ingestion as needed."""

    _client: Poma = PrivateAttr()

    def __init__(self, *, client: Poma):
        """Initialize with Poma client instance."""
        super().__init__()
        self._client = client

    def _parse_nodes(
        self, nodes: Sequence[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> list[BaseNode]:
        """Not implemented, use _get_nodes_from_documents()."""
        raise NotImplementedError("Not implemented, use _get_nodes_from_documents().")

    def _get_nodes_from_documents(
        self, documents: Sequence[Document], show_progress: bool = False
    ) -> list[BaseNode]:
        """
        Convert LlamaIndex Documents into chunkset Nodes via POMA API.
        Each output Node represents a chunkset, with associated chunks in metadata.
        """

        documents = list(documents)
        if not documents:
            raise InvalidInputError("No documents provided to process.")

        total_docs = len(documents)
        chunked_nodes: list[BaseNode] = []
        failed_paths: list[str] = []

        def _safe_int(value: object) -> int | None:
            if isinstance(value, bool):
                return None
            if isinstance(value, int):
                return value
            if isinstance(value, str):
                try:
                    return int(value.strip())
                except Exception:
                    return None
            try:
                return int(value)  # type: ignore[arg-type]
            except Exception:
                return None

        def _doc_id_and_src(doc: Document) -> tuple[str, str]:
            src_path = doc.metadata.get("source_path", "in-memory-text")
            doc_id = doc.metadata.get("doc_id") or Path(src_path).stem or "unknown-doc"
            return doc_id, src_path

        def _process_one(
            poma_doc: Document, doc_idx: int, total_docs: int
        ) -> tuple[list[BaseNode], str | None]:
            """Process a single document via POMA API, return chunkset nodes or failed source path."""
            try:
                doc_id, src_path = _doc_id_and_src(poma_doc)
                path_obj: Path | None = None
                if isinstance(src_path, str) and src_path.strip():
                    try:
                        path = Path(src_path).expanduser().resolve()
                        if path.exists():
                            path_obj = path
                    except Exception:
                        path_obj = None
                if not path_obj:
                    raise InvalidInputError(
                        "No valid source_path found in document metadata."
                    )
                start_result = self._client.start_chunk_file(path_obj, base_url=None)
                job_id = start_result.get("job_id")
                if not job_id:
                    raise RuntimeError("Failed to receive job ID from server.")
                if show_progress:
                    print(
                        f"[{doc_idx}/{total_docs}] Job {job_id} started for: {src_path}. Polling for results..."
                    )
                result = self._client.get_chunk_result(
                    str(job_id), show_progress=show_progress
                )
                chunks: list[dict] = result.get("chunks", [])
                chunksets: list[dict] = result.get("chunksets", [])
            except Exception as exception:
                print(
                    f"[{doc_idx}/{total_docs}] Exception chunking document: {exception}"
                )
                src_path = poma_doc.metadata.get("source_path", "in-memory-text")
                return [], src_path

            file_nodes: list[BaseNode] = []
            try:
                chunks_by_index: dict[int, dict] = {}
                for chunk in chunks:
                    idx = _safe_int(chunk.get("chunk_index"))
                    if idx is not None:
                        chunks_by_index[idx] = chunk
                for cs in chunksets:
                    chunkset_index = cs.get("chunkset_index")
                    chunks_indices = cs.get("chunks", []) or []
                    normalized_indices: list[int] = []
                    for chunk_index in chunks_indices:
                        idx = _safe_int(chunk_index)
                        if idx is not None:
                            normalized_indices.append(idx)
                    relevant_chunks = [
                        chunks_by_index[idx]
                        for idx in normalized_indices
                        if idx in chunks_by_index
                    ]
                    text_node = TextNode(
                        text=cs.get("contents", ""),
                        metadata={
                            "doc_id": doc_id,
                            "chunkset_index": chunkset_index,
                            "chunks": relevant_chunks,
                            "chunkset": cs,
                            "source_path": src_path,
                        },
                    )
                    # Keep embeddings clean - just embed content, not metadata
                    text_node.excluded_embed_metadata_keys = list(
                        text_node.metadata.keys()
                    )
                    file_nodes.append(text_node)
            except Exception as exception:
                print(
                    f"[{doc_idx}/{total_docs}] Exception processing chunking result: {exception}"
                )
                src_path = poma_doc.metadata.get("source_path", "in-memory-text")
                return [], src_path
            return file_nodes, None

        # parallel processing of documents
        cores = os.cpu_count() or 1
        group_size = 5 if cores >= 5 else cores
        for start in range(0, total_docs, group_size):
            batch = list(
                enumerate(documents[start : start + group_size], start=start + 1)
            )
            with ThreadPoolExecutor(max_workers=group_size) as executor:
                futures = {
                    executor.submit(_process_one, doc, idx, total_docs): (idx, doc)
                    for idx, doc in batch
                }
                for future in as_completed(futures):
                    idx, doc = futures[future]
                    try:
                        node_chunks, failed_src = future.result()
                        if failed_src is None:
                            chunked_nodes.extend(node_chunks)
                            if show_progress:
                                src_path = doc.metadata.get(
                                    "source_path", "in-memory-text"
                                )
                                print(
                                    f"[{idx}/{total_docs}] Done: {src_path} (+{len(node_chunks)} node-chunks)"
                                )
                        else:
                            failed_paths.append(failed_src)
                            if show_progress:
                                print(f"[{idx}/{total_docs}] Failed: {failed_src}")
                    except Exception as error:
                        failed_paths.append(
                            doc.metadata.get("source_path", "in-memory-text")
                        )
                        if show_progress:
                            print(
                                f"[{idx}/{total_docs}] Failed with unexpected error: {error}"
                            )

        if failed_paths:
            print("The following files failed to process:")
            for path in failed_paths:
                print(f"  - {path}")

        if not chunked_nodes:
            raise InvalidInputError("No documents could be split successfully.")

        return chunked_nodes


# ----------------------------------------------------------------
# Cheatsheet Retriever
# ----------------------------------------------------------------


class PomaCheatsheetRetrieverLI(BaseRetriever):

    def __init__(self, base: BaseRetriever):
        """Wrap an existing LlamaIndex retriever. Keep its callback/verbosity."""
        if not isinstance(base, BaseRetriever):
            raise ValueError("base must be an instance of BaseRetriever.")
        super().__init__(
            callback_manager=getattr(base, "callback_manager", None),
            verbose=getattr(base, "_verbose", False),
        )
        self._base = base

    def _retrieve(self, query_bundle: QueryBundle) -> list[NodeWithScore]:
        """Retrieve chunkset nodes and generate cheatsheets for the given query."""
        nodes = self._base.retrieve(query_bundle)
        if not nodes:
            return []
        grouped: dict[str, list[NodeWithScore]] = defaultdict(list)
        best_score: dict[str, float] = defaultdict(float)
        for node in nodes:
            doc_id = node.metadata["doc_id"]
            grouped[doc_id].append(node)
            best_score[doc_id] = max(best_score[doc_id], node.score or 1.0)
        cheatsheet_nodes: list[NodeWithScore] = []
        for doc_id, chunked_nodes in grouped.items():
            cheatsheet = self._create_cheatsheet_llamaindex(chunked_nodes)
            cheatsheet_node = TextNode(text=cheatsheet, metadata={"doc_id": doc_id})
            cheatsheet_nodes.append(
                NodeWithScore(node=cheatsheet_node, score=best_score[doc_id])
            )
        return cheatsheet_nodes

    def as_query_engine(self, **kwargs):
        """Wrap as a LlamaIndex RetrieverQueryEngine."""
        from llama_index.core.query_engine import RetrieverQueryEngine

        return RetrieverQueryEngine(self, **kwargs)

    def _create_cheatsheet_llamaindex(self, chunked_nodes: list[NodeWithScore]) -> str:
        """Generate a single deduplicated cheatsheet from chunked nodes."""
        all_chunks = []
        seen = set()
        for node in chunked_nodes:
            doc_id = node.metadata.get("doc_id", "unknown_doc")
            chunks = node.metadata.get("chunks", [])
            if not chunks:
                continue
            for chunk in chunks:
                if not isinstance(chunk, dict):
                    continue
                chunk_index = chunk.get("chunk_index")
                if chunk_index is None or chunk_index in seen:
                    continue
                seen.add(chunk_index)
                chunk["tag"] = doc_id
                all_chunks.append(chunk)
        sorted_chunks = sorted(
            all_chunks, key=lambda chunk: (chunk["tag"], chunk["chunk_index"])
        )
        cheatsheets = _cheatsheets_from_chunks(sorted_chunks)
        if (
            not cheatsheets
            or not isinstance(cheatsheets, list)
            or len(cheatsheets) == 0
            or "content" not in cheatsheets[0]
        ):
            raise Exception(
                "Unknown error; cheatsheet could not be created from input chunks."
            )
        return cheatsheets[0]["content"]
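Taken together, the three classes above form a load → chunk → retrieve pipeline. Below is a minimal illustrative sketch of how they could be wired up; it is not shipped with the package, and it assumes that `Poma()` picks up `POMA_API_KEY` from the environment and that a stock `VectorStoreIndex` with default embeddings is acceptable.

```python
# Illustrative wiring of the classes above; not part of the package itself.
from llama_index.core import VectorStoreIndex

from poma import Poma
from poma.integrations.llamaindex_poma import (
    PomaChunksetNodeParser,
    PomaCheatsheetRetrieverLI,
    PomaFileReader,
)

# 1. Load supported files into LlamaIndex Documents.
documents = PomaFileReader().load_data("./docs")

# 2. Chunk them via the POMA API into chunkset nodes.
client = Poma()  # assumption: reads POMA_API_KEY from the environment
parser = PomaChunksetNodeParser(client=client)
nodes = parser._get_nodes_from_documents(documents, show_progress=True)

# 3. Index the nodes (default embeddings; set OPENAI_API_KEY or swap them out).
index = VectorStoreIndex(nodes)

# 4. Wrap the base retriever so hits are merged into per-document cheatsheets.
retriever = PomaCheatsheetRetrieverLI(index.as_retriever(similarity_top_k=8))
for hit in retriever.retrieve("What does the contract say about termination?"):
    print(hit.node.metadata["doc_id"], hit.score)
```

Calling the private `_get_nodes_from_documents` directly mirrors the `NotImplementedError` message in `_parse_nodes` above; the retriever wrapping follows `PomaCheatsheetRetrieverLI.__init__`.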
poma/retrieval.py
ADDED
@@ -0,0 +1,176 @@
# retrieval.py
from collections import defaultdict
from itertools import chain
from typing import Any


def generate_cheatsheets(
    relevant_chunksets: list[dict[str, Any]], all_chunks: list[dict[str, Any]]
) -> list[dict[str, Any]]:
    chunk_ids = [cs["chunks"] for cs in relevant_chunksets if "chunks" in cs]
    chunk_ids = list(chain.from_iterable(chunk_ids))  # flatten the list
    relevant_chunks = _get_relevant_chunks_for_ids(chunk_ids, all_chunks)
    sorted_chunks = sorted(
        relevant_chunks, key=lambda chunk: (chunk["tag"], chunk["chunk_index"])
    )
    return _cheatsheets_from_chunks(sorted_chunks)


def generate_single_cheatsheet(
    relevant_chunksets: list[dict[str, Any]], all_chunks: list[dict[str, Any]]
) -> str:

    def prepare_single_doc_chunks(
        chunk_dicts: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        # Make sure there are no duplicate chunk_index values
        check_dict = defaultdict(set)
        has_duplicates = any(
            chunk["chunk_index"] in check_dict[chunk["tag"]]
            or check_dict[chunk["tag"]].add(chunk["chunk_index"])
            for chunk in chunk_dicts
        )
        if has_duplicates:
            raise ValueError(
                "Duplicate chunk indices found in single document mode. "
                "Each chunk must have a unique index."
            )
        # Use a fixed tag for chunks from single documents
        for chunk_dict in chunk_dicts:
            chunk_dict["tag"] = "single_doc"
        return chunk_dicts

    chunk_ids = [cs["chunks"] for cs in relevant_chunksets if "chunks" in cs]
    chunk_ids = list(chain.from_iterable(chunk_ids))  # flatten the list
    relevant_chunks = _get_relevant_chunks_for_ids(chunk_ids, all_chunks)
    relevant_chunks = prepare_single_doc_chunks(relevant_chunks)
    sorted_chunks = sorted(
        relevant_chunks, key=lambda chunk: (chunk["tag"], chunk["chunk_index"])
    )
    cheatsheets = _cheatsheets_from_chunks(sorted_chunks)
    if (
        not cheatsheets
        or not isinstance(cheatsheets, list)
        or len(cheatsheets) == 0
        or "content" not in cheatsheets[0]
    ):
        raise Exception(
            "Unknown error; cheatsheet could not be created from input chunks."
        )
    return cheatsheets[0]["content"]


def _get_relevant_chunks_for_ids(
    chunk_ids: list[int],
    chunks: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    chunk_indices_of_retrieved_chunksets = chunk_ids
    all_chunks_of_doc = chunks

    # Build helpers
    sorted_chunks = sorted(all_chunks_of_doc, key=lambda c: c["chunk_index"])
    index_to_chunk = {c["chunk_index"]: c for c in sorted_chunks}
    index_to_depth = {c["chunk_index"]: c["depth"] for c in sorted_chunks}

    # Find relatively deepest indices in the retrieval
    candidate_indices = set(chunk_indices_of_retrieved_chunksets)

    def is_ancestor(idx1, idx2):
        """True if idx1 is an ancestor of idx2."""
        # idx1 must be before idx2 and have smaller depth
        if idx1 >= idx2:
            return False
        depth1 = index_to_depth[idx1]
        depth2 = index_to_depth[idx2]
        if depth1 >= depth2:
            return False
        # scan from idx1+1 up to idx2, making sure all are deeper than depth1 until idx2
        for i in range(idx1 + 1, idx2 + 1):
            depth = index_to_depth[sorted_chunks[i]["chunk_index"]]
            if depth <= depth1 and sorted_chunks[i]["chunk_index"] != idx2:
                return False
        return True

    # Exclude any index that is an ancestor of another in the set
    relatively_deepest = set(candidate_indices)
    for idx1 in candidate_indices:
        for idx2 in candidate_indices:
            if idx1 != idx2 and is_ancestor(idx1, idx2):
                relatively_deepest.discard(idx1)
                break

    # Standard subtree/parent finding routines
    def get_child_indices(chunk_index: int) -> list[int]:
        base_depth = index_to_depth[chunk_index]
        children = []
        for i in range(chunk_index + 1, len(sorted_chunks)):
            idx = sorted_chunks[i]["chunk_index"]
            depth = sorted_chunks[i]["depth"]
            if depth <= base_depth:
                break
            children.append(idx)
        return children

    def get_parent_indices(chunk_index: int) -> list[int]:
        parents = []
        current_depth = index_to_depth[chunk_index]
        for i in range(chunk_index - 1, -1, -1):
            idx = sorted_chunks[i]["chunk_index"]
            depth = sorted_chunks[i]["depth"]
            if depth < current_depth:
                parents.append(idx)
                current_depth = depth
        return parents[::-1]  # root -> leaf order

    # Collect all relevant indices
    all_indices = set(
        chunk_indices_of_retrieved_chunksets
    )  # always include all search hits
    for idx in relatively_deepest:
        all_indices.update(get_child_indices(idx))

    # Parents for all found nodes
    for idx in list(all_indices):
        all_indices.update(get_parent_indices(idx))

    # Return in doc order
    return [index_to_chunk[i] for i in sorted(all_indices)]


def _cheatsheets_from_chunks(
    content_chunks: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    cheatsheets: list[dict] = []

    compressed_data = {}
    for chunk in content_chunks:
        if chunk["tag"] not in compressed_data:
            # If there is data stored for a previous tag, save it to the cheatsheets list
            if compressed_data:
                for key, value in compressed_data.items():
                    cheatsheets.append({"tag": key, "content": value["content"]})
                # Clear the compressed_data for the current tag
                compressed_data.clear()
            # Start a new entry for the current tag
            compressed_data[chunk["tag"]] = {
                "content": chunk["content"],
                "last_chunk": chunk["chunk_index"],
            }
        else:
            # Check if chunks are consecutive
            if (
                chunk["chunk_index"]
                == int(compressed_data[chunk["tag"]]["last_chunk"]) + 1
            ):
                compressed_data[chunk["tag"]]["content"] += "\n" + chunk["content"]
            else:
                compressed_data[chunk["tag"]]["content"] += "\n[…]\n" + chunk["content"]
            # Update the last chunk index
            compressed_data[chunk["tag"]]["last_chunk"] = chunk["chunk_index"]

    # Save the last processed entry to the cheatsheets list
    if compressed_data:
        for key, value in compressed_data.items():
            cheatsheets.append({"tag": key, "content": value["content"]})

    return cheatsheets
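To make the behaviour of these helpers concrete, here is a small illustrative example with invented chunk dicts; the field names (`tag`, `chunk_index`, `depth`, `content`) follow the code above. The retrieved chunkset points only at chunk 2, so the helper pulls in that chunk's parent heading and marks the skipped sibling with `[…]`.

```python
# Illustrative only: toy data shaped like the chunk/chunkset dicts used above.
from poma.retrieval import generate_cheatsheets

all_chunks = [
    {"tag": "contract.md", "chunk_index": 0, "depth": 0, "content": "# Termination"},
    {"tag": "contract.md", "chunk_index": 1, "depth": 1, "content": "Either party may terminate..."},
    {"tag": "contract.md", "chunk_index": 2, "depth": 1, "content": "Notice period: 30 days."},
]

# A retrieval step decided that only the chunkset covering chunk 2 is relevant.
relevant_chunksets = [{"chunkset_index": 0, "chunks": [2]}]

print(generate_cheatsheets(relevant_chunksets, all_chunks))
# [{'tag': 'contract.md',
#   'content': '# Termination\n[…]\nNotice period: 30 days.'}]
```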
poma-0.0.0.dist-info/METADATA
ADDED
@@ -0,0 +1,66 @@
Metadata-Version: 2.4
Name: poma
Version: 0.0.0
Summary: Official Python SDK for the Poma document-processing API
Author-email: "POMA AI GmbH, Berlin" <sdk@poma-ai.com>
License-Expression: MPL-2.0
Keywords: chunking,structure,rag,poma,documents,ai
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: httpx==0.28.1
Requires-Dist: pydantic==2.11.7
Provides-Extra: integrations
Requires-Dist: langchain==0.3.27; extra == "integrations"
Requires-Dist: langchain-text-splitters==0.3.9; extra == "integrations"
Requires-Dist: llama-index==0.13.0; extra == "integrations"
Provides-Extra: integration-examples
Requires-Dist: langchain==0.3.27; extra == "integration-examples"
Requires-Dist: langchain-text-splitters==0.3.9; extra == "integration-examples"
Requires-Dist: llama-index==0.13.0; extra == "integration-examples"
Requires-Dist: llama-index-vector-stores-faiss==0.5.0; extra == "integration-examples"
Requires-Dist: faiss-cpu==1.10.0; extra == "integration-examples"
Requires-Dist: langchain_openai==0.3.28; extra == "integration-examples"
Requires-Dist: langchain_community==0.3.27; extra == "integration-examples"
Requires-Dist: llama-index-embeddings-langchain==0.4.0; extra == "integration-examples"
Requires-Dist: dotenv; extra == "integration-examples"
Dynamic: license-file

# POMA: Preserving Optimal Markdown Architecture

## Quick-Start Guide

### Installation

Requires Python 3.10+. Install the core package:
```bash
pip install poma
```

For the LangChain and LlamaIndex integrations:
```bash
pip install poma[integrations]
# Or, including the extras used by the examples:
pip install poma[integration-examples]
```

- You may also want `pip install python-dotenv` to load API keys from a .env file.
- The POMA AI client requires an API key, read from the POMA_API_KEY environment variable.
- **To request a POMA_API_KEY, please contact us at api@poma-ai.com**

### Example Implementations

All examples, integrations, and additional information can be found in our GitHub repository: [poma-ai/poma](https://github.com/poma-ai/).

We provide three example implementations to help you get started with POMA AI:
- example.py - a standalone implementation for documents, showing the basic POMA AI workflow with simple keyword-based retrieval
- example_langchain.py - integration with LangChain, demonstrating how to use POMA AI with LangChain
- example_llamaindex.py - integration with LlamaIndex, showing how to use POMA AI with LlamaIndex

*Note: The integration examples use OpenAI embeddings. Make sure to set your OPENAI_API_KEY environment variable, or replace the embeddings with your preferred ones.*

All examples follow the same two-phase process (ingest → retrieve) but demonstrate different integration options for your RAG pipeline.

! Please do NOT send any sensitive or personal information to POMA AI endpoints without a signed contract & DPA !
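A rough sketch of that two-phase, standalone workflow follows. The client method names mirror the calls made in `poma/integrations/llamaindex_poma.py` above; the `Poma()` constructor, the shape of the job result, and the keyword matching are assumptions for illustration, not documented API.

```python
# Hypothetical sketch of the ingest -> retrieve flow; client method names are
# taken from the integration code above, everything else is assumed.
from pathlib import Path

from poma import Poma
from poma.retrieval import generate_cheatsheets

client = Poma()  # assumption: reads POMA_API_KEY from the environment

# Phase 1: ingest - start a chunking job for a file and poll for the result.
job = client.start_chunk_file(Path("contract.pdf"), base_url=None)
result = client.get_chunk_result(str(job["job_id"]), show_progress=True)
chunks, chunksets = result["chunks"], result["chunksets"]

# Phase 2: retrieve - a naive keyword match stands in for a real retriever.
query_terms = {"termination", "notice"}
hits = [
    cs for cs in chunksets
    if any(term in cs.get("contents", "").lower() for term in query_terms)
]

# generate_cheatsheets groups chunks by their "tag"; assign one per document.
for chunk in chunks:
    chunk.setdefault("tag", "contract.pdf")

for cheatsheet in generate_cheatsheets(hits, chunks):
    print(cheatsheet["content"])
```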
poma-0.0.0.dist-info/RECORD
ADDED
@@ -0,0 +1,12 @@
poma/__init__.py,sha256=SARVBTJw2pkIXR2_OYMPYjB7W335er_2-9j4yhzVTZI,266
poma/client.py,sha256=HOlNVfSPYzq1RjLVUtIqbObH5QSLRbi7KnbCnBjpUh4,14201
poma/exceptions.py,sha256=5d8SdIIRFotKUJJAy9mct2q44oEmAsR15OVEmkLDfkQ,518
poma/retrieval.py,sha256=bm68_1QscJXa76sxVuAkcwdeGsvEZaQkrY3-3uUxrIg,6730
poma/integrations/__init__.py,sha256=xrrJluggTLtrKs4jLOZUWkFENqWSHSnhCqYQYY51kq0,405
poma/integrations/langchain_poma.py,sha256=IL3pWWGCEK_O0JagpnKPFRwKclyNTwPcaTTdKJkYfYY,14608
poma/integrations/llamaindex_poma.py,sha256=n3M71QXGVA2RTsUC24ZTt__VHEgsTbIW9BVwEn1Xxbg,14868
poma-0.0.0.dist-info/licenses/LICENSE,sha256=YRzZ4sQOWV3ut0G4LHZJ2hT90shzZufGlXoIx4LWFEo,15254
poma-0.0.0.dist-info/METADATA,sha256=JRBqFlkSlEQQEkotQmR02vRniUWeFlkKQ2bjdTGr7ZU,3079
poma-0.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
poma-0.0.0.dist-info/top_level.txt,sha256=f_3c5Y6SojNnH0iiiE898fIKF6R2LqWyAw-BGi-72YI,5
poma-0.0.0.dist-info/RECORD,,