poma 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
poma/integrations/llamaindex_poma.py ADDED
@@ -0,0 +1,361 @@
+ # ---------------------------------------------------------------------
+ # POMA integration for LlamaIndex
+ # ---------------------------------------------------------------------
+
+ import os
+ import hashlib
+ from typing import Any
+ from pathlib import Path
+ from collections import defaultdict
+ from collections.abc import Sequence, Iterable
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+ from llama_index.core.readers.base import BaseReader
+ from llama_index.core.node_parser import NodeParser
+ from llama_index.core.retrievers import BaseRetriever
+ from llama_index.core.schema import (
+     Document,
+     NodeWithScore,
+     TextNode,
+     BaseNode,
+     QueryBundle,
+ )
+ from pydantic import PrivateAttr
+
+ from poma import Poma
+ from poma.client import ALLOWED_FILE_EXTENSIONS
+ from poma.retrieval import _cheatsheets_from_chunks
+ from poma.exceptions import InvalidInputError
+
+ __all__ = ["PomaFileReader", "PomaChunksetNodeParser", "PomaCheatsheetRetrieverLI"]
+
+
+ # ------------------------------------------------------------------ #
+ #                 Load from Path → LI Documents                      #
+ # ------------------------------------------------------------------ #
+
+
+ class PomaFileReader(BaseReader):
+
+     def load_data(self, input_path: str | Path) -> list[Document]:
+         """
+         Load files from the input path (file or directory) into LlamaIndex Documents.
+         Only files with allowed extensions are processed; others are skipped.
+         """
+         path = Path(input_path).expanduser().resolve()
+         if not path.exists():
+             raise FileNotFoundError(f"No such path: {path}")
+
+         documents: list[Document] = []
+         skipped: int = 0
+
+         def _process_file(file_path: Path):
+             nonlocal skipped, documents
+             if not file_path.is_file():
+                 return
+             file_extension = file_path.suffix.lower()
+             if not file_extension or file_extension not in ALLOWED_FILE_EXTENSIONS:
+                 skipped += 1
+                 return
+             file_bytes = file_path.read_bytes()
+             file_hash = hashlib.md5(file_bytes).hexdigest()
+             if file_extension == ".pdf":
+                 # LlamaIndex requires `text` to be str.
+                 # Actual file processing happens downstream in the node parser.
+                 text_payload: str = ""
+             else:
+                 try:
+                     text_payload = file_bytes.decode("utf-8")
+                 except UnicodeDecodeError:
+                     skipped += 1
+                     return
+             documents.append(
+                 Document(
+                     text=text_payload,
+                     metadata={
+                         "source_path": str(file_path),
+                         "doc_id": f"{file_hash}",
+                     },
+                 )
+             )
+
+         if path.is_file():
+             _process_file(path)
+         elif path.is_dir():
+             for path_in_dir in sorted(path.rglob("*")):
+                 _process_file(path_in_dir)
+         else:
+             raise FileNotFoundError(f"Unsupported path type (not file/dir): {path}")
+
+         allowed = ", ".join(sorted(ALLOWED_FILE_EXTENSIONS))
+         if not documents:
+             raise InvalidInputError(f"No supported files found. Allowed: {allowed}")
+         if skipped > 0:
+             print(
+                 f"Skipped {skipped} file(s) due to unsupported or unreadable type. Allowed: {allowed}"
+             )
+         return documents
+
+
+ # ------------------------------------------------------------------ #
+ #                        Generate Chunksets                          #
+ # ------------------------------------------------------------------ #
+
+
+ class PomaChunksetNodeParser(NodeParser):
+     """Calls the **POMA API** for each document, choosing text vs file ingestion as needed."""
+
+     _client: Poma = PrivateAttr()
+
+     def __init__(self, *, client: Poma):
+         """Initialize with a Poma client instance."""
+         super().__init__()
+         self._client = client
+
+     def _parse_nodes(
+         self, nodes: Sequence[BaseNode], show_progress: bool = False, **kwargs: Any
+     ) -> list[BaseNode]:
+         """Not implemented, use _get_nodes_from_documents()."""
+         raise NotImplementedError("Not implemented, use _get_nodes_from_documents().")
+
+     def _get_nodes_from_documents(
+         self, documents: Sequence[Document], show_progress: bool = False
+     ) -> list[BaseNode]:
+         """
+         Convert LlamaIndex Documents into chunkset Nodes via the POMA API.
+         Each output Node represents a chunkset, with its associated chunks in metadata.
+         """
+
+         documents = list(documents)
+         if not documents:
+             raise InvalidInputError("No documents provided to process.")
+
+         total_docs = len(documents)
+         chunked_nodes: list[BaseNode] = []
+         failed_paths: list[str] = []
+
+         def _safe_int(value: object) -> int | None:
+             if isinstance(value, bool):
+                 return None
+             if isinstance(value, int):
+                 return value
+             if isinstance(value, str):
+                 try:
+                     return int(value.strip())
+                 except Exception:
+                     return None
+             try:
+                 return int(value)  # type: ignore[arg-type]
+             except Exception:
+                 return None
+
+         def _doc_id_and_src(doc: Document) -> tuple[str, str]:
+             src_path = doc.metadata.get("source_path", "in-memory-text")
+             doc_id = doc.metadata.get("doc_id") or Path(src_path).stem or "unknown-doc"
+             return doc_id, src_path
+
+         def _process_one(
+             poma_doc: Document, doc_idx: int, total_docs: int
+         ) -> tuple[list[BaseNode], str | None]:
+             """Process a single document via the POMA API; return chunkset nodes or the failed source path."""
+             try:
+                 doc_id, src_path = _doc_id_and_src(poma_doc)
+                 path_obj: Path | None = None
+                 if isinstance(src_path, str) and src_path.strip():
+                     try:
+                         path = Path(src_path).expanduser().resolve()
+                         if path.exists():
+                             path_obj = path
+                     except Exception:
+                         path_obj = None
+                 if not path_obj:
+                     raise InvalidInputError(
+                         "No valid source_path found in document metadata."
+                     )
+                 start_result = self._client.start_chunk_file(path_obj, base_url=None)
+                 job_id = start_result.get("job_id")
+                 if not job_id:
+                     raise RuntimeError("Failed to receive job ID from server.")
+                 if show_progress:
+                     print(
+                         f"[{doc_idx}/{total_docs}] ⏳ Job {job_id} started for: {src_path}. Polling for results..."
+                     )
+                 result = self._client.get_chunk_result(
+                     str(job_id), show_progress=show_progress
+                 )
+                 chunks: list[dict] = result.get("chunks", [])
+                 chunksets: list[dict] = result.get("chunksets", [])
+             except Exception as exception:
+                 print(
+                     f"[{doc_idx}/{total_docs}] ❌ Exception chunking document: {exception}"
+                 )
+                 src_path = poma_doc.metadata.get("source_path", "in-memory-text")
+                 return [], src_path
+
+             file_nodes: list[BaseNode] = []
+             try:
+                 chunks_by_index: dict[int, dict] = {}
+                 for chunk in chunks:
+                     idx = _safe_int(chunk.get("chunk_index"))
+                     if idx is not None:
+                         chunks_by_index[idx] = chunk
+                 for cs in chunksets:
+                     chunkset_index = cs.get("chunkset_index")
+                     chunks_indices = cs.get("chunks", []) or []
+                     normalized_indices: list[int] = []
+                     for chunk_index in chunks_indices:
+                         idx = _safe_int(chunk_index)
+                         if idx is not None:
+                             normalized_indices.append(idx)
+                     relevant_chunks = [
+                         chunks_by_index[idx]
+                         for idx in normalized_indices
+                         if idx in chunks_by_index
+                     ]
+                     text_node = TextNode(
+                         text=cs.get("contents", ""),
+                         metadata={
+                             "doc_id": doc_id,
+                             "chunkset_index": chunkset_index,
+                             "chunks": relevant_chunks,
+                             "chunkset": cs,
+                             "source_path": src_path,
+                         },
+                     )
+                     # Keep embeddings clean – just embed content, not metadata
+                     text_node.excluded_embed_metadata_keys = list(
+                         text_node.metadata.keys()
+                     )
+                     file_nodes.append(text_node)
+             except Exception as exception:
+                 print(
+                     f"[{doc_idx}/{total_docs}] ❌ Exception processing chunking result: {exception}"
+                 )
+                 src_path = poma_doc.metadata.get("source_path", "in-memory-text")
+                 return [], src_path
+             return file_nodes, None
+
+         # Process the documents in parallel, in batches of at most group_size
+         cores = os.cpu_count() or 1
+         group_size = 5 if cores >= 5 else cores
+         for start in range(0, total_docs, group_size):
+             batch = list(
+                 enumerate(documents[start : start + group_size], start=start + 1)
+             )
+             with ThreadPoolExecutor(max_workers=group_size) as executor:
+                 futures = {
+                     executor.submit(_process_one, doc, idx, total_docs): (idx, doc)
+                     for idx, doc in batch
+                 }
+                 for future in as_completed(futures):
+                     idx, doc = futures[future]
+                     try:
+                         node_chunks, failed_src = future.result()
+                         if failed_src is None:
+                             chunked_nodes.extend(node_chunks)
+                             if show_progress:
+                                 src_path = doc.metadata.get(
+                                     "source_path", "in-memory-text"
+                                 )
+                                 print(
+                                     f"[{idx}/{total_docs}] ✅ Done: {src_path} (+{len(node_chunks)} node-chunks)"
+                                 )
+                         else:
+                             failed_paths.append(failed_src)
+                             if show_progress:
+                                 print(f"[{idx}/{total_docs}] ❌ Failed: {failed_src}")
+                     except Exception as error:
+                         failed_paths.append(
+                             doc.metadata.get("source_path", "in-memory-text")
+                         )
+                         if show_progress:
+                             print(
+                                 f"[{idx}/{total_docs}] ❌ Failed with unexpected error: {error}"
+                             )
+
+         if failed_paths:
+             print("The following files failed to process:")
+             for path in failed_paths:
+                 print(f" - {path}")
+
+         if not chunked_nodes:
+             raise InvalidInputError("No documents could be split successfully.")
+
+         return chunked_nodes
+
+
+ # ----------------------------------------------------------------
+ #                     Cheatsheet Retriever
+ # ----------------------------------------------------------------
+
+
+ class PomaCheatsheetRetrieverLI(BaseRetriever):
+
+     def __init__(self, base: BaseRetriever):
+         """Wrap an existing LlamaIndex retriever. Keep its callback/verbosity."""
+         if not isinstance(base, BaseRetriever):
+             raise ValueError("base must be an instance of BaseRetriever.")
+         super().__init__(
+             callback_manager=getattr(base, "callback_manager", None),
+             verbose=getattr(base, "_verbose", False),
+         )
+         self._base = base
+
+     def _retrieve(self, query_bundle: QueryBundle) -> list[NodeWithScore]:
+         """Retrieve chunkset nodes and generate cheatsheets for the given query."""
+         nodes = self._base.retrieve(query_bundle)
+         if not nodes:
+             return []
+         grouped: dict[str, list[NodeWithScore]] = defaultdict(list)
+         best_score: dict[str, float] = defaultdict(float)
+         for node in nodes:
+             doc_id = node.metadata["doc_id"]
+             grouped[doc_id].append(node)
+             best_score[doc_id] = max(best_score[doc_id], node.score or 1.0)
+         cheatsheet_nodes: list[NodeWithScore] = []
+         for doc_id, chunked_nodes in grouped.items():
+             cheatsheet = self._create_cheatsheet_llamaindex(chunked_nodes)
+             cheatsheet_node = TextNode(text=cheatsheet, metadata={"doc_id": doc_id})
+             cheatsheet_nodes.append(
+                 NodeWithScore(node=cheatsheet_node, score=best_score[doc_id])
+             )
+         return cheatsheet_nodes
+
+     def as_query_engine(self, **kwargs):
+         """Wrap as a LlamaIndex RetrieverQueryEngine."""
+         from llama_index.core.query_engine import RetrieverQueryEngine
+
+         return RetrieverQueryEngine(self, **kwargs)
+
+     def _create_cheatsheet_llamaindex(self, chunked_nodes: list[NodeWithScore]) -> str:
+         """Generate a single deduplicated cheatsheet from chunked nodes."""
+         all_chunks = []
+         seen = set()
+         for node in chunked_nodes:
+             doc_id = node.metadata.get("doc_id", "unknown_doc")
+             chunks = node.metadata.get("chunks", [])
+             if not chunks:
+                 continue
+             for chunk in chunks:
+                 if not isinstance(chunk, dict):
+                     continue
+                 chunk_index = chunk.get("chunk_index")
+                 if chunk_index is None or chunk_index in seen:
+                     continue
+                 seen.add(chunk_index)
+                 chunk["tag"] = doc_id
+                 all_chunks.append(chunk)
+         sorted_chunks = sorted(
+             all_chunks, key=lambda chunk: (chunk["tag"], chunk["chunk_index"])
+         )
+         cheatsheets = _cheatsheets_from_chunks(sorted_chunks)
+         if (
+             not cheatsheets
+             or not isinstance(cheatsheets, list)
+             or len(cheatsheets) == 0
+             or "content" not in cheatsheets[0]
+         ):
+             raise Exception(
+                 "Unknown error; cheatsheet could not be created from input chunks."
+             )
+         return cheatsheets[0]["content"]
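
The sketch below shows how these three classes could be wired together in a LlamaIndex pipeline. It is illustrative only: it assumes `Poma()` picks up `POMA_API_KEY` from the environment (see the README further down), relies on whatever default embedding model is configured for `VectorStoreIndex`, and calls `_get_nodes_from_documents()` directly, as the `NotImplementedError` message above suggests; the path and query are placeholders.

```python
from llama_index.core import VectorStoreIndex

from poma import Poma
from poma.integrations.llamaindex_poma import (
    PomaChunksetNodeParser,
    PomaCheatsheetRetrieverLI,
    PomaFileReader,
)

# Ingest: read files and let the POMA API split them into chunkset nodes
documents = PomaFileReader().load_data("docs/")
parser = PomaChunksetNodeParser(client=Poma())
nodes = parser._get_nodes_from_documents(documents, show_progress=True)

# Retrieve: index the chunkset nodes and wrap the retriever to get cheatsheets back
index = VectorStoreIndex(nodes)
retriever = PomaCheatsheetRetrieverLI(index.as_retriever(similarity_top_k=4))
for node_with_score in retriever.retrieve("What does the warranty cover?"):
    print(node_with_score.node.get_content())
```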
poma/retrieval.py ADDED
@@ -0,0 +1,176 @@
+ # retrieval.py
+ from collections import defaultdict
+ from itertools import chain
+ from typing import Any
+
+
+ def generate_cheatsheets(
+     relevant_chunksets: list[dict[str, Any]], all_chunks: list[dict[str, Any]]
+ ) -> list[dict[str, Any]]:
+     chunk_ids = [cs["chunks"] for cs in relevant_chunksets if "chunks" in cs]
+     chunk_ids = list(chain.from_iterable(chunk_ids))  # flatten the list
+     relevant_chunks = _get_relevant_chunks_for_ids(chunk_ids, all_chunks)
+     sorted_chunks = sorted(
+         relevant_chunks, key=lambda chunk: (chunk["tag"], chunk["chunk_index"])
+     )
+     return _cheatsheets_from_chunks(sorted_chunks)
+
+
+ def generate_single_cheatsheet(
+     relevant_chunksets: list[dict[str, Any]], all_chunks: list[dict[str, Any]]
+ ) -> str:
+
+     def prepare_single_doc_chunks(
+         chunk_dicts: list[dict[str, Any]],
+     ) -> list[dict[str, Any]]:
+         # Make sure there are no duplicate chunk_index values
+         check_dict = defaultdict(set)
+         has_duplicates = any(
+             chunk["chunk_index"] in check_dict[chunk["tag"]]
+             or check_dict[chunk["tag"]].add(chunk["chunk_index"])
+             for chunk in chunk_dicts
+         )
+         if has_duplicates:
+             raise ValueError(
+                 "Duplicate chunk indices found in single document mode. "
+                 "Each chunk must have a unique index."
+             )
+         # Use a fixed tag for chunks from single documents
+         for chunk_dict in chunk_dicts:
+             chunk_dict["tag"] = "single_doc"
+         return chunk_dicts
+
+     chunk_ids = [cs["chunks"] for cs in relevant_chunksets if "chunks" in cs]
+     chunk_ids = list(chain.from_iterable(chunk_ids))  # flatten the list
+     relevant_chunks = _get_relevant_chunks_for_ids(chunk_ids, all_chunks)
+     relevant_chunks = prepare_single_doc_chunks(relevant_chunks)
+     sorted_chunks = sorted(
+         relevant_chunks, key=lambda chunk: (chunk["tag"], chunk["chunk_index"])
+     )
+     cheatsheets = _cheatsheets_from_chunks(sorted_chunks)
+     if (
+         not cheatsheets
+         or not isinstance(cheatsheets, list)
+         or len(cheatsheets) == 0
+         or "content" not in cheatsheets[0]
+     ):
+         raise Exception(
+             "Unknown error; cheatsheet could not be created from input chunks."
+         )
+     return cheatsheets[0]["content"]
+
+
+ def _get_relevant_chunks_for_ids(
+     chunk_ids: list[int],
+     chunks: list[dict[str, Any]],
+ ) -> list[dict[str, Any]]:
+     chunk_indices_of_retrieved_chunksets = chunk_ids
+     all_chunks_of_doc = chunks
+
+     # Build helpers (chunk_index values are assumed to be 0-based and contiguous, so they double as list positions)
+     sorted_chunks = sorted(all_chunks_of_doc, key=lambda c: c["chunk_index"])
+     index_to_chunk = {c["chunk_index"]: c for c in sorted_chunks}
+     index_to_depth = {c["chunk_index"]: c["depth"] for c in sorted_chunks}
+
+     # Find relatively deepest indices in the retrieval
+     candidate_indices = set(chunk_indices_of_retrieved_chunksets)
+
+     def is_ancestor(idx1, idx2):
+         """True if idx1 is an ancestor of idx2."""
+         # idx1 must be before idx2 and have smaller depth
+         if idx1 >= idx2:
+             return False
+         depth1 = index_to_depth[idx1]
+         depth2 = index_to_depth[idx2]
+         if depth1 >= depth2:
+             return False
+         # scan from idx1+1 up to idx2, making sure all are deeper than depth1 until idx2
+         for i in range(idx1 + 1, idx2 + 1):
+             depth = index_to_depth[sorted_chunks[i]["chunk_index"]]
+             if depth <= depth1 and sorted_chunks[i]["chunk_index"] != idx2:
+                 return False
+         return True
+
+     # Exclude any index that is an ancestor of another in the set
+     relatively_deepest = set(candidate_indices)
+     for idx1 in candidate_indices:
+         for idx2 in candidate_indices:
+             if idx1 != idx2 and is_ancestor(idx1, idx2):
+                 relatively_deepest.discard(idx1)
+                 break
+
+     # Standard subtree/parent finding routines
+     def get_child_indices(chunk_index: int) -> list[int]:
+         base_depth = index_to_depth[chunk_index]
+         children = []
+         for i in range(chunk_index + 1, len(sorted_chunks)):
+             idx = sorted_chunks[i]["chunk_index"]
+             depth = sorted_chunks[i]["depth"]
+             if depth <= base_depth:
+                 break
+             children.append(idx)
+         return children
+
+     def get_parent_indices(chunk_index: int) -> list[int]:
+         parents = []
+         current_depth = index_to_depth[chunk_index]
+         for i in range(chunk_index - 1, -1, -1):
+             idx = sorted_chunks[i]["chunk_index"]
+             depth = sorted_chunks[i]["depth"]
+             if depth < current_depth:
+                 parents.append(idx)
+                 current_depth = depth
+         return parents[::-1]  # root -> leaf order
+
+     # Collect all relevant indices
+     all_indices = set(
+         chunk_indices_of_retrieved_chunksets
+     )  # always include all search hits
+     for idx in relatively_deepest:
+         all_indices.update(get_child_indices(idx))
+
+     # Parents for all found nodes
+     for idx in list(all_indices):
+         all_indices.update(get_parent_indices(idx))
+
+     # Return in doc order
+     return [index_to_chunk[i] for i in sorted(all_indices)]
+
+
+ def _cheatsheets_from_chunks(
+     content_chunks: list[dict[str, Any]],
+ ) -> list[dict[str, Any]]:
+     cheatsheets: list[dict] = []
+
+     compressed_data = {}
+     for chunk in content_chunks:
+         if chunk["tag"] not in compressed_data:
+             # If there is data stored for a previous tag, save it to the cheatsheets list
+             if compressed_data:
+                 for key, value in compressed_data.items():
+                     cheatsheets.append({"tag": key, "content": value["content"]})
+                 # Clear the compressed_data for the current tag
+                 compressed_data.clear()
+             # Start a new entry for the current tag
+             compressed_data[chunk["tag"]] = {
+                 "content": chunk["content"],
+                 "last_chunk": chunk["chunk_index"],
+             }
+         else:
+             # Check if chunks are consecutive
+             if (
+                 chunk["chunk_index"]
+                 == int(compressed_data[chunk["tag"]]["last_chunk"]) + 1
+             ):
+                 compressed_data[chunk["tag"]]["content"] += "\n" + chunk["content"]
+             else:
+                 compressed_data[chunk["tag"]]["content"] += "\n[…]\n" + chunk["content"]
+             # Update the last chunk index
+             compressed_data[chunk["tag"]]["last_chunk"] = chunk["chunk_index"]
+
+     # Save the last processed entry to the cheatsheets list
+     if compressed_data:
+         for key, value in compressed_data.items():
+             cheatsheets.append({"tag": key, "content": value["content"]})
+
+     return cheatsheets
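
To make `_get_relevant_chunks_for_ids` and `_cheatsheets_from_chunks` concrete, here is a small hand-made walk-through (the chunks and their `depth`/`chunk_index`/`tag` fields are invented for illustration; real values come from the chunking API). A hit on the two "Usage" paragraphs pulls in their ancestors and marks the skipped "Install" branch with `[…]`:

```python
from poma.retrieval import generate_cheatsheets

# A tiny document tree: chunk_index is 0-based and contiguous, depth encodes nesting.
chunks = [
    {"tag": "manual", "chunk_index": 0, "depth": 0, "content": "# Manual"},
    {"tag": "manual", "chunk_index": 1, "depth": 1, "content": "## Install"},
    {"tag": "manual", "chunk_index": 2, "depth": 2, "content": "Run pip install poma."},
    {"tag": "manual", "chunk_index": 3, "depth": 1, "content": "## Usage"},
    {"tag": "manual", "chunk_index": 4, "depth": 2, "content": "Create a Poma client."},
    {"tag": "manual", "chunk_index": 5, "depth": 2, "content": "Call start_chunk_file()."},
]
# Pretend retrieval matched the chunkset that covers chunks 4 and 5.
retrieved_chunksets = [{"chunkset_index": 1, "chunks": [4, 5]}]

print(generate_cheatsheets(retrieved_chunksets, chunks)[0]["content"])
# # Manual
# […]
# ## Usage
# Create a Poma client.
# Call start_chunk_file().
```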
poma-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,68 @@
+ Metadata-Version: 2.4
+ Name: poma
+ Version: 0.1.0
+ Summary: Official Python SDK for the Poma document-processing API
+ Author-email: "POMA AI GmbH, Berlin" <sdk@poma-ai.com>
+ License-Expression: MPL-2.0
+ Keywords: chunking,structure,rag,poma,documents,ai
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: httpx==0.28.1
+ Requires-Dist: pydantic==2.11.7
+ Provides-Extra: integrations
+ Requires-Dist: langchain==0.3.27; extra == "integrations"
+ Requires-Dist: langchain-text-splitters==0.3.9; extra == "integrations"
+ Requires-Dist: llama-index==0.13.0; extra == "integrations"
+ Provides-Extra: integration-examples
+ Requires-Dist: langchain==0.3.27; extra == "integration-examples"
+ Requires-Dist: langchain-text-splitters==0.3.9; extra == "integration-examples"
+ Requires-Dist: llama-index==0.13.0; extra == "integration-examples"
+ Requires-Dist: llama-index-vector-stores-faiss==0.5.0; extra == "integration-examples"
+ Requires-Dist: faiss-cpu==1.10.0; extra == "integration-examples"
+ Requires-Dist: langchain_openai==0.3.28; extra == "integration-examples"
+ Requires-Dist: langchain_community==0.3.27; extra == "integration-examples"
+ Requires-Dist: llama-index-embeddings-langchain==0.4.0; extra == "integration-examples"
+ Requires-Dist: dotenv; extra == "integration-examples"
+ Provides-Extra: examples
+ Requires-Dist: dotenv; extra == "examples"
+ Dynamic: license-file
+
+ ![POMA AI Logo](https://raw.githubusercontent.com/poma-ai/.github/main/assets/POMA_AI_Logo_Pink.svg)
+ # 📚 POMA: Preserving Optimal Markdown Architecture
+
+ ## 🚀 Quick-Start Guide
+
+ ### Installation
+
+ Requires Python 3.10+. Install the core package:
+ ```bash
+ pip install poma
+ ```
+
+ For the LangChain and LlamaIndex integrations:
+ ```bash
+ pip install poma[integrations]
+ # Or LangChain/LlamaIndex including example extras:
+ pip install poma[integration-examples]
+ ```
+
+
+ - Optional: `pip install python-dotenv` to load API keys from a `.env` file.
+ - An API key is required for the POMA AI client and is read from the `POMA_API_KEY` environment variable (see the sketch below).
+ - **To request a POMA_API_KEY, please contact us at api@poma-ai.com**
+
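+ A minimal sketch of that setup (assumptions: `Poma()` with no arguments picks the key up from the environment, as described above; the constructor is not documented here):
+
+ ```python
+ import os
+
+ from dotenv import load_dotenv  # provided by the python-dotenv package
+ from poma import Poma
+
+ load_dotenv()  # loads POMA_API_KEY (and e.g. OPENAI_API_KEY) from a local .env file
+ assert os.getenv("POMA_API_KEY"), "POMA_API_KEY is not set"
+ client = Poma()  # assumption: the client reads POMA_API_KEY from the environment
+ ```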
+
+ ### Example Implementations
+
+ All examples, integrations, and additional information can be found in our GitHub repository: [poma-ai/poma](https://github.com/poma-ai/poma). We provide the following example implementations to help you get started with POMA AI:
+ - `example.py`: a standalone implementation for documents, showing the basic POMA AI workflow with simple keyword-based retrieval
+ - `example_langchain.py`: integration with LangChain, demonstrating how to use POMA AI with LangChain
+ - `example_llamaindex.py`: integration with LlamaIndex, demonstrating how to use POMA AI with LlamaIndex
+
+ *Note: The integration examples use OpenAI embeddings. Make sure to set your OPENAI_API_KEY environment variable, or replace the embeddings with your preferred ones.*
+
+
+ All examples follow the same two-phase process (ingest → retrieve) but demonstrate different integration options for your RAG pipeline.
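+
+ As a rough, illustrative sketch of that two-phase flow with the plain SDK (the file name, query, naive keyword matching, and the `tag` assignment are placeholders; `example.py` in the repository is the reference implementation):
+
+ ```python
+ from pathlib import Path
+
+ from poma import Poma
+ from poma.retrieval import generate_cheatsheets
+
+ client = Poma()  # assumes POMA_API_KEY is set in the environment
+
+ # Phase 1 – ingest: send a file and poll for the chunking result
+ job = client.start_chunk_file(Path("manual.pdf"), base_url=None)
+ result = client.get_chunk_result(str(job["job_id"]), show_progress=True)
+ chunks, chunksets = result["chunks"], result["chunksets"]
+ for chunk in chunks:
+     chunk.setdefault("tag", "manual")  # the integrations tag each chunk with its document id
+
+ # Phase 2 – retrieve: pick relevant chunksets (here a naive keyword match) and build cheatsheets
+ query = "warranty"
+ hits = [cs for cs in chunksets if query in cs.get("contents", "").lower()]
+ for sheet in generate_cheatsheets(hits, chunks):
+     print(sheet["content"])
+ ```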
+
+ ! Please do NOT send any sensitive and/or personal information to POMA AI endpoints without having a signed contract & DPA !
poma-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+ poma/__init__.py,sha256=SARVBTJw2pkIXR2_OYMPYjB7W335er_2-9j4yhzVTZI,266
+ poma/client.py,sha256=l4folcs_vkfBoKqLQSPbSq3RnAbBFLKjvrOQ9Obhhvg,14202
+ poma/exceptions.py,sha256=5d8SdIIRFotKUJJAy9mct2q44oEmAsR15OVEmkLDfkQ,518
+ poma/retrieval.py,sha256=bm68_1QscJXa76sxVuAkcwdeGsvEZaQkrY3-3uUxrIg,6730
+ poma/integrations/__init__.py,sha256=xrrJluggTLtrKs4jLOZUWkFENqWSHSnhCqYQYY51kq0,405
+ poma/integrations/langchain_poma.py,sha256=IL3pWWGCEK_O0JagpnKPFRwKclyNTwPcaTTdKJkYfYY,14608
+ poma/integrations/llamaindex_poma.py,sha256=n3M71QXGVA2RTsUC24ZTt__VHEgsTbIW9BVwEn1Xxbg,14868
+ poma-0.1.0.dist-info/licenses/LICENSE,sha256=YRzZ4sQOWV3ut0G4LHZJ2hT90shzZufGlXoIx4LWFEo,15254
+ poma-0.1.0.dist-info/METADATA,sha256=_UxHkGjIOLccTV73nPZKJ6Qmv4zyM0yGA1qxrTsIy8U,3151
+ poma-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ poma-0.1.0.dist-info/top_level.txt,sha256=f_3c5Y6SojNnH0iiiE898fIKF6R2LqWyAw-BGi-72YI,5
+ poma-0.1.0.dist-info/RECORD,,
poma-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+