poma-0.1.4-py3-none-any.whl → poma-0.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
poma/client.py CHANGED
@@ -8,18 +8,15 @@ import time
 from pathlib import Path
 from typing import Any
 
-from poma.exceptions import AuthenticationError, InvalidInputError, RemoteServerError, InvalidResponseError
-from poma.retrieval import generate_cheatsheets, generate_single_cheatsheet
+from poma.exceptions import (
+    AuthenticationError,
+    RemoteServerError,
+    InvalidResponseError,
+)
+from poma.retrieval import generate_cheatsheets
 
-USER_AGENT = "poma-ai-sdk/0.1.0"
 
-ALLOWED_FILE_EXTENSIONS: set[str] = {
-    ".txt",
-    ".md",
-    ".html",
-    ".htm",
-    ".pdf",
-}
+USER_AGENT = "poma-ai-sdk/0.1.0"
 
 
 API_BASE_URL = "https://api.poma-ai.com/api/v1"
 
@@ -51,10 +48,13 @@ class Poma:
         # Override API base URL if environment variable is set
         if os.environ.get("API_BASE_URL"):
             api_base_url = os.environ.get("API_BASE_URL")
+        if not api_base_url:
+            raise ValueError("API base URL cannot be empty.")
 
         self.base_api_url = api_base_url.rstrip("/")
-        self._client = client or httpx.Client(timeout=timeout,
-                                              headers={"user-agent": USER_AGENT})
+        self._client = client or httpx.Client(
+            timeout=timeout, headers={"user-agent": USER_AGENT}
+        )
         if not (api_key := api_key or os.environ.get("POMA_API_KEY", "")):
             raise Exception("POMA_API_KEY environment variable not set.")
         self._client.headers.update({"Authorization": f"Bearer {api_key}"})
@@ -77,11 +77,6 @@ class Poma:
         """
         if not file_path or not isinstance(file_path, os.PathLike):
            raise ValueError("file_path must be a non-empty os.PathLike.")
-        file_extension = Path(file_path).suffix.lower()
-        if file_extension not in ALLOWED_FILE_EXTENSIONS:
-            raise InvalidInputError(
-                f"File extension of {file_path} is not allowed; use one of the following types: {', '.join(sorted(ALLOWED_FILE_EXTENSIONS))}."
-            )
         payload = {}
         if base_url:
             payload["base_url"] = base_url
@@ -97,12 +92,18 @@ class Poma:
         except httpx.HTTPStatusError as error:
             status = error.response.status_code
             if status in (401, 403):
-                raise AuthenticationError(response.text) from error
-            raise RemoteServerError(f"{status}: {response.text}") from error
+                raise AuthenticationError(
+                    f"Failed to submit file '{file_path}': authentication error"
+                ) from error
+            raise RemoteServerError(
+                f"Failed to submit file '{file_path}': {status}"
+            ) from error
         try:
             data = response.json()
         except ValueError as error:
-            raise InvalidResponseError("Server returned non-JSON or empty body") from error
+            raise InvalidResponseError(
+                "Server returned non-JSON or empty body"
+            ) from error
         return data
 
     def get_chunk_result(
@@ -149,19 +150,25 @@ class Poma:
                 download = data.get("download", {})
                 download_url = download.get("download_url", "")
                 if not download_url:
-                    raise RuntimeError("Failed to receive download URL from server.")
+                    raise RuntimeError(
+                        "Failed to receive download URL from server."
+                    )
 
                 if download_dir is None:
                     # Return bytes content instead of saving to file
                     file_bytes = self.download_bytes(download_url)
-                    return self.extract_chunks_and_chunksets_from_poma_archive(poma_archive_data=file_bytes)
+                    return self.extract_chunks_and_chunksets_from_poma_archive(
+                        poma_archive_data=file_bytes
+                    )
                 else:
                     # Save downloaded file to directory
                     filename = download.get("filename", "downloaded_file.poma")
-                    downloaded_file_path = self.download_file(download_url,
-                                                              filename,
-                                                              save_directory=download_dir)
-                    return self.extract_chunks_and_chunksets_from_poma_archive(poma_archive_path=downloaded_file_path)
+                    downloaded_file_path = self.download_file(
+                        download_url, filename, save_directory=download_dir
+                    )
+                    return self.extract_chunks_and_chunksets_from_poma_archive(
+                        poma_archive_path=downloaded_file_path
+                    )
             elif status == "failed":
                 error_code = data.get("code", "unknown")
                 error_details = data.get("error", "No details provided.")
@@ -197,33 +204,29 @@ class Poma:
         Returns:
             dict: A dictionary containing the chunks and chunksets.
         """
-        # Sanity check for parameters
-        if not poma_archive_data and not poma_archive_path:
-            raise ValueError("Either poma_archive_data or poma_archive_path must be provided.")
 
         # Load the chunks and chunksets from POMA archive
         chunks = None
         chunksets = None
         if poma_archive_path:
             with zipfile.ZipFile(poma_archive_path, "r") as zip_ref:
-                chunks = zip_ref.read('chunks.json')
-                chunksets = zip_ref.read('chunksets.json')
-        else:
+                chunks = zip_ref.read("chunks.json")
+                chunksets = zip_ref.read("chunksets.json")
+        elif poma_archive_data:
             with zipfile.ZipFile(io.BytesIO(poma_archive_data), "r") as zip_ref:
-                chunks = zip_ref.read('chunks.json')
-                chunksets = zip_ref.read('chunksets.json')
+                chunks = zip_ref.read("chunks.json")
+                chunksets = zip_ref.read("chunksets.json")
+        else:
+            raise ValueError(
+                "Either poma_archive_data or poma_archive_path must be provided."
+            )
 
         # Sanity check
         if not chunks or not chunksets:
-            raise KeyError(
-                "Result must contain 'chunks' and 'chunksets' keys."
-            )
+            raise KeyError("Result must contain 'chunks' and 'chunksets' keys.")
 
         # Load the chunks and chunksets
-        json_result = {
-            "chunks": json.loads(chunks),
-            "chunksets": json.loads(chunksets)
-        }
+        json_result = {"chunks": json.loads(chunks), "chunksets": json.loads(chunksets)}
         return json_result
 
     def create_cheatsheet(
@@ -234,14 +237,24 @@ class Poma:
         """
         Generates a single cheatsheet for one single document
         from relevant chunksets (relevant for a certain query)
-        and from all available chunks (which must contain the textual content).
+        and from all chunks of that document (providing the textual content).
         Args:
             relevant_chunksets (list[dict]): A list of chunksets, each containing a "chunks" key with a list of chunk IDs.
-            all_chunks (list[dict]): A list of all available chunk dictionaries, each representing a chunk of content.
+            all_chunks (list[dict]): A list of all chunk dictionaries of the same document, each representing a chunk of content.
         Returns:
             str: The textual content of the generated cheatsheet.
         """
-        return generate_single_cheatsheet(relevant_chunksets, all_chunks)
+        cheatsheets = generate_cheatsheets(relevant_chunksets, all_chunks)
+        if (
+            not cheatsheets
+            or not isinstance(cheatsheets, list)
+            or len(cheatsheets) == 0
+            or "content" not in cheatsheets[0]
+        ):
+            raise Exception(
+                "Unknown error; cheatsheet could not be created from input chunks."
+            )
+        return cheatsheets[0]["content"]
 
     def create_cheatsheets(
         self,
@@ -250,14 +263,14 @@
     ) -> list[dict[str, Any]]:
         """
         Generates cheatsheets from relevant chunksets (relevant for a certain query)
-        and from all available chunks (which must contain the textual content).
-        One cheatsheet is created for each document tag found in the chunks.
+        and from all the chunks of all affected documents (providing the textual content).
+        One cheatsheet is created for each document found in the chunks (tagged with file_id).
         Args:
             relevant_chunksets (list[dict]): A list of chunksets, each containing a "chunks" key with a list of chunk IDs.
-            all_chunks (list[dict]): A list of all available chunk dictionaries, each representing a chunk of content.
+            all_chunks (list[dict]): A list of all available chunk dictionaries of affected documents, each representing a chunk of content.
         Returns:
             list[dict]: A list of dictionaries representing the generated cheatsheets, each containing:
-                - 'tag': The tag associated with the respective document.
+                - 'file_id': The tag associated with the respective document.
                 - 'content': The textual content of the generated cheatsheet.
         """
         return generate_cheatsheets(relevant_chunksets, all_chunks)
@@ -283,7 +296,7 @@ class Poma:
         """
         if not download_url:
             raise ValueError("download_url cannot be empty")
-
+
         # Determine filename
         if not filename:
             filename = Path(download_url).name or "downloaded_file"
@@ -303,7 +316,8 @@
         # Save the file
         with open(save_path, "wb") as f:
             f.write(content)
-        return save_path
+
+        return str(save_path)
 
     def download_bytes(
         self,
@@ -319,27 +333,27 @@
         """
         if not download_url:
             raise ValueError("download_url cannot be empty")
-
+
        # Construct the full URL if it's a relative path
         if download_url.startswith("/"):
             full_url = f"{self.base_api_url}{download_url}"
         else:
             full_url = download_url
-
+
         print("Downloading file from:", full_url)
         try:
-            # Download the file
             response = self._client.get(full_url)
             response.raise_for_status()
-
-            # Return the bytes content
             return response.content
-
         except httpx.HTTPStatusError as error:
             status = error.response.status_code
             if status in (401, 403):
-                raise AuthenticationError(f"Authentication failed when downloading file: {response.text}") from error
-            raise RemoteServerError(f"Failed to download file: {status} {response.text}") from error
+                raise AuthenticationError(
+                    f"Failed to download '{download_url}': authentication error"
+                ) from error
+            raise RemoteServerError(
+                f"Failed to download '{download_url}': {status}"
+            ) from error
         except Exception as error:
             raise RuntimeError(f"File download failed: {error}") from error
 
poma/exceptions.py CHANGED
@@ -16,5 +16,6 @@ class RemoteServerError(PomaSDKError):
 class InvalidInputError(PomaSDKError):
     """Raised when an unsupported *Content‑Type* is given to ``chunk_text``."""
 
+
 class InvalidResponseError(PomaSDKError):
     """Raised when the server returns non-JSON or empty body."""
poma/integrations/langchain_poma.py CHANGED
@@ -18,9 +18,8 @@ from langchain_text_splitters import TextSplitter
 from pydantic import Field, PrivateAttr
 
 from poma import Poma
-from poma.client import ALLOWED_FILE_EXTENSIONS
 from poma.exceptions import InvalidInputError
-from poma.retrieval import _cheatsheets_from_chunks
+from poma.retrieval import chunks_from_dicts, _cheatsheets_from_chunks
 
 __all__ = ["PomaFileLoader", "PomaChunksetSplitter", "PomaCheatsheetRetrieverLC"]
 
@@ -52,10 +51,6 @@ class PomaFileLoader(BaseLoader):
             nonlocal skipped, documents
             if not file_path.is_file():
                 return
-            file_extension = file_path.suffix.lower()
-            if not file_extension or file_extension not in ALLOWED_FILE_EXTENSIONS:
-                skipped += 1
-                return
             file_bytes = file_path.read_bytes()
             file_hash = hashlib.md5(file_bytes).hexdigest()
             if file_path.suffix.lower() == ".pdf":
@@ -84,13 +79,10 @@ class PomaFileLoader(BaseLoader):
         else:
             raise FileNotFoundError(f"Unsupported path type (not file/dir): {path}")
 
-        allowed = ", ".join(sorted(ALLOWED_FILE_EXTENSIONS))
         if not documents:
-            raise InvalidInputError(f"No supported files found. Allowed: {allowed}")
+            raise InvalidInputError(f"No supported files found.")
         if skipped > 0:
-            print(
-                f"Skipped {skipped} file(s) due to unsupported or unreadable type. Allowed: {allowed}"
-            )
+            print(f"Skipped {skipped} file(s) due to unsupported or unreadable type.")
         return documents
 
 
@@ -330,7 +322,7 @@ class PomaCheatsheetRetrieverLC(BaseRetriever):
 
     def _create_cheatsheet_langchain(self, chunked_docs: list[Document]) -> str:
         """Generate a single deduplicated cheatsheet from chunked documents."""
-        all_chunks = []
+        all_chunk_dicts = []
         seen = set()
         for doc in chunked_docs:
             doc_id = doc.metadata.get("doc_id", "unknown_doc")
@@ -341,11 +333,9 @@ class PomaCheatsheetRetrieverLC(BaseRetriever):
                 if chunk_index not in seen:
                     seen.add(chunk_index)
                     chunk["tag"] = doc_id
-                    all_chunks.append(chunk)
-        sorted_chunks = sorted(
-            all_chunks, key=lambda chunk: (chunk["tag"], chunk["chunk_index"])
-        )
-        cheatsheets = _cheatsheets_from_chunks(sorted_chunks)
+                    all_chunk_dicts.append(chunk)
+        all_chunks = chunks_from_dicts(all_chunk_dicts)
+        cheatsheets = _cheatsheets_from_chunks(all_chunks)
         if (
             not cheatsheets
             or not isinstance(cheatsheets, list)
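Both retriever integrations (this LangChain one and the LlamaIndex one below) now hand their deduplicated chunk dicts to `chunks_from_dicts()` instead of sorting by `tag` themselves. A standalone sketch of that conversion step, with invented chunk dicts in place of real archive data:

```python
from poma.retrieval import chunks_from_dicts, _cheatsheets_from_chunks

# Invented sample data; real chunk dicts come from a POMA archive.
chunk_dicts = [
    {"chunk_index": 0, "tag": "doc-a", "content": "Intro", "depth": 0},
    {"chunk_index": 1, "tag": "doc-a", "content": "Details", "depth": 0},
]
chunks = chunks_from_dicts(chunk_dicts)  # legacy "tag" doubles as the file_id
print(_cheatsheets_from_chunks(chunks)[0]["content"])
# Intro
# Details
```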
poma/integrations/llamaindex_poma.py CHANGED
@@ -23,9 +23,8 @@ from llama_index.core.schema import (
 from pydantic import PrivateAttr
 
 from poma import Poma
-from poma.client import ALLOWED_FILE_EXTENSIONS
-from poma.retrieval import _cheatsheets_from_chunks
 from poma.exceptions import InvalidInputError
+from poma.retrieval import chunks_from_dicts, _cheatsheets_from_chunks
 
 __all__ = ["PomaFileReader", "PomaChunksetNodeParser", "PomaCheatsheetRetrieverLI"]
 
@@ -54,9 +53,6 @@ class PomaFileReader(BaseReader):
             if not file_path.is_file():
                 return
             file_extension = file_path.suffix.lower()
-            if not file_extension or file_extension not in ALLOWED_FILE_EXTENSIONS:
-                skipped += 1
-                return
             file_bytes = file_path.read_bytes()
             file_hash = hashlib.md5(file_bytes).hexdigest()
             if file_extension == ".pdf":
@@ -87,13 +83,10 @@ class PomaFileReader(BaseReader):
         else:
             raise FileNotFoundError(f"Unsupported path type (not file/dir): {path}")
 
-        allowed = ", ".join(sorted(ALLOWED_FILE_EXTENSIONS))
         if not documents:
-            raise InvalidInputError(f"No supported files found. Allowed: {allowed}")
+            raise InvalidInputError(f"No supported files found.")
         if skipped > 0:
-            print(
-                f"Skipped {skipped} file(s) due to unsupported or unreadable type. Allowed: {allowed}"
-            )
+            print(f"Skipped {skipped} file(s) due to unsupported or unreadable type.")
         return documents
 
 
@@ -329,7 +322,7 @@ class PomaCheatsheetRetrieverLI(BaseRetriever):
 
     def _create_cheatsheet_llamaindex(self, chunked_nodes: list[NodeWithScore]) -> str:
         """Generate a single deduplicated cheatsheet from chunked nodes."""
-        all_chunks = []
+        all_chunk_dicts = []
         seen = set()
         for node in chunked_nodes:
             doc_id = node.metadata.get("doc_id", "unknown_doc")
@@ -344,11 +337,9 @@ class PomaCheatsheetRetrieverLI(BaseRetriever):
                 continue
             seen.add(chunk_index)
             chunk["tag"] = doc_id
-            all_chunks.append(chunk)
-        sorted_chunks = sorted(
-            all_chunks, key=lambda chunk: (chunk["tag"], chunk["chunk_index"])
-        )
-        cheatsheets = _cheatsheets_from_chunks(sorted_chunks)
+            all_chunk_dicts.append(chunk)
+        all_chunks = chunks_from_dicts(all_chunk_dicts)
+        cheatsheets = _cheatsheets_from_chunks(all_chunks)
         if (
             not cheatsheets
             or not isinstance(cheatsheets, list)
poma/retrieval.py CHANGED
@@ -2,62 +2,206 @@
 from collections import defaultdict
 from itertools import chain
 from typing import Any
+import warnings
+
+
+def deprecated(replacement: str):
+    def decorator(func):
+        msg = (
+            f"{func.__name__}() is deprecated and will be removed in a future version. "
+            f"Use {replacement} instead."
+        )
+
+        def wrapper(*args, **kwargs):
+            warnings.warn(msg, DeprecationWarning, stacklevel=2)
+            return func(*args, **kwargs)
+
+        wrapper.__name__ = func.__name__
+        wrapper.__doc__ = (func.__doc__ or "") + f"\n\nDEPRECATED: {msg}\n"
+        return wrapper
+
+    return decorator
 
 
 def generate_cheatsheets(
-    relevant_chunksets: list[dict[str, Any]], all_chunks: list[dict[str, Any]]
+    relevant_chunksets: list[dict[str, Any]],
+    all_chunks: list[dict[str, Any]],
 ) -> list[dict[str, Any]]:
-    chunk_ids = [cs["chunks"] for cs in relevant_chunksets if "chunks" in cs]
-    chunk_ids = list(chain.from_iterable(chunk_ids))  # flatten the list
-    relevant_chunks = _get_relevant_chunks_for_ids(chunk_ids, all_chunks)
-    sorted_chunks = sorted(
-        relevant_chunks, key=lambda chunk: (chunk["tag"], chunk["chunk_index"])
-    )
-    return _cheatsheets_from_chunks(sorted_chunks)
+    # get chunks grouped by document file_id
+    doc_chunks = defaultdict(list)
+    for chunk in all_chunks:
+        file_id = chunk.get("file_id") or chunk.get("tag") or "single_doc"
+        chunk["file_id"] = file_id  # update
+        doc_chunks[file_id].append(chunk)
 
+    # Check for duplicate chunk_index values
+    # (necessary when file_id was not set in chunks)
+    for file_id, chunks in doc_chunks.items():
+        chunk_indices = [c["chunk_index"] for c in chunks]
+        if len(chunk_indices) != len(set(chunk_indices)):
+            raise ValueError(f"Duplicate chunk_index found for file_id: {file_id}")
 
-def generate_single_cheatsheet(
-    relevant_chunksets: list[dict[str, Any]], all_chunks: list[dict[str, Any]]
-) -> str:
+    # get relevant chunksets grouped by document file_id
+    relevant_chunksets_per_doc = defaultdict(list)
+    for chunkset in relevant_chunksets:
+        file_id = chunkset.get("file_id") or chunkset.get("tag") or "single_doc"
+        chunkset["file_id"] = file_id  # update
+        if "chunks" not in chunkset:
+            raise ValueError(
+                "Chunkset not valid; must contain a 'chunks' key with a list of chunk IDs."
+            )
+        relevant_chunksets_per_doc[file_id].append(chunkset)
 
-    def prepare_single_doc_chunks(
-        chunk_dicts: list[dict[str, Any]],
-    ) -> list[dict[str, Any]]:
-        # Make sure there are no duplicate chunk_index values
-        check_dict = defaultdict(set)
-        has_duplicates = any(
-            chunk["chunk_index"] in check_dict[chunk["tag"]]
-            or check_dict[chunk["tag"]].add(chunk["chunk_index"])
-            for chunk in chunk_dicts
-        )
-        if has_duplicates:
-            raise ValueError(
-                "Duplicate chunk indices found in single document mode. "
-                "Each chunk must have a unique index."
-            )
-        # Use a fixed tag for chunks from single documents
-        for chunk_dict in chunk_dicts:
-            chunk_dict["tag"] = "single_doc"
-        return chunk_dicts
-
-    chunk_ids = [cs["chunks"] for cs in relevant_chunksets if "chunks" in cs]
-    chunk_ids = list(chain.from_iterable(chunk_ids))  # flatten the list
-    relevant_chunks = _get_relevant_chunks_for_ids(chunk_ids, all_chunks)
-    relevant_chunks = prepare_single_doc_chunks(relevant_chunks)
-    sorted_chunks = sorted(
-        relevant_chunks, key=lambda chunk: (chunk["tag"], chunk["chunk_index"])
-    )
-    cheatsheets = _cheatsheets_from_chunks(sorted_chunks)
-    if (
-        not cheatsheets
-        or not isinstance(cheatsheets, list)
-        or len(cheatsheets) == 0
-        or "content" not in cheatsheets[0]
-    ):
-        raise Exception(
-            "Unknown error; cheatsheet could not be created from input chunks."
-        )
-    return cheatsheets[0]["content"]
+    # Ensure that chunksets and chunks correspond to the same file_ids
+    for file_id in relevant_chunksets_per_doc.keys():
+        if file_id not in doc_chunks:
+            raise ValueError(
+                f"Chunksets contain file_id '{file_id}' which is not present in the chunks."
+            )
+
+    # retrieve relevant chunks with content per document
+    relevant_content_chunks: list[RetrievalChunk] = []
+    for file_id, chunksets_per_doc in relevant_chunksets_per_doc.items():
+        chunk_ids = list(  # flattened list
+            chain.from_iterable(chunkset["chunks"] for chunkset in chunksets_per_doc)
+        )
+        relevant_chunks_dict = _get_relevant_chunks_for_ids(
+            chunk_ids, doc_chunks[file_id]
+        )
+        relevant_chunks: list[RetrievalChunk] = chunks_from_dicts(relevant_chunks_dict)
+        relevant_content_chunks.extend(relevant_chunks)
+
+    return _cheatsheets_from_chunks(relevant_content_chunks)
+
+
+@deprecated("generate_cheatsheets(relevant_chunksets, all_chunks)")
+def generate_single_cheatsheet(
+    relevant_chunksets: list[dict[str, Any]],
+    all_chunks: list[dict[str, Any]],
+) -> str:
+    cheatsheets = generate_cheatsheets(
+        relevant_chunksets=relevant_chunksets,
+        all_chunks=all_chunks,
+    )
+    return cheatsheets[0].get("content", "") if cheatsheets else ""
+
+
+########################
+# RetrievalChunk Class #
+########################
+
+
+class RetrievalChunk:
+    """
+    Represents a chunk of text with associated metadata.
+    Attributes:
+        index (int): The index of the chunk within a sequence.
+        file_id (str): The id associating the chunk with a document.
+        content (str): The textual content of the chunk.
+        depth_rebased (int, optional): The hierarchical depth of the chunk content.
+            In cheatsheets, this affects indentation for certain text parts.
+            Currently only used for code blocks.
+    """
+
+    def __init__(
+        self,
+        index: int,
+        file_id: str,
+        content: str,
+        depth_rebased: int | None,
+    ):
+        self.index = index
+        self.file_id = file_id
+        self.content = content
+        self.depth_rebased = depth_rebased
+
+    @classmethod
+    def from_chunk_dict(
+        cls,
+        chunk_dict: dict,
+        block_min_depth: int | None,
+    ):
+        if block_min_depth is not None:
+            depth = int(chunk_dict["depth"])
+            depth_rebased = cls._rebase_depth(depth, block_min_depth)
+        else:
+            depth_rebased = None
+        return cls(
+            index=int(chunk_dict["chunk_index"]),
+            file_id=str(
+                chunk_dict.get("file_id") or chunk_dict.get("tag") or "single_doc"
+            ),
+            content=str(chunk_dict["content"]),
+            depth_rebased=depth_rebased,
+        )
+
+    @staticmethod
+    def _rebase_depth(depth: int, min_depth: int, base_unit: int = 0) -> int | None:
+        rebased = depth - min_depth + base_unit
+        return max(0, rebased)
+
+    def __repr__(self):
+        return f"RetrievalChunk(index={self.index}, file_id={self.file_id}, content={self.content}), depth_rebased={self.depth_rebased}"
+
+
+def chunks_from_dicts(chunk_dicts: list[dict]) -> list[RetrievalChunk]:
+    """
+    Converts a list of chunk dictionaries into a list of Chunk objects.
+    File_ids are needed to identify chunks from different documents;
+    if is_single_doc is True, all chunks are assumed to come from a single document
+    and file_id is optional.
+    Args:
+        chunk_dicts (list[dict]): A list of dictionaries, each representing a chunk with required keys:
+            - "chunk_index": The index of the chunk within the document.
+            - "file_id": The identifier of the document.
+            - "content": The textual content of the chunk.
+            - "depth": The depth or level of the chunk.
+    Returns:
+        list[Chunk]: A list of Chunk objects with the textual content needed for the cheatsheets.
+    """
+
+    # Determine the minimum depth per code block
+    min_depth_per_code_block: dict[str, int] = {}
+    for chunk_dict in chunk_dicts:
+        block_id = chunk_dict.get("code")
+        if block_id is None:
+            continue
+        depth = int(chunk_dict["depth"])
+        current = min_depth_per_code_block.get(block_id)
+        min_depth_per_code_block[block_id] = (
+            depth if current is None else min(current, depth)
+        )
+
+    # Create Chunk objects
+    all_chunks: list[RetrievalChunk] = []
+    for chunk_dict in chunk_dicts:
+        code_id = chunk_dict.get("code")
+        if bool(code_id):
+            block_min_depth = min_depth_per_code_block.get(str(code_id))
+        else:
+            block_min_depth = None
+        chunk = RetrievalChunk.from_chunk_dict(chunk_dict, block_min_depth)
+        all_chunks.append(chunk)
+
+    # Sanity check: Make sure there are no duplicate chunk_index values
+    check_dict = defaultdict(set)
+    has_duplicates = any(
+        chunk.index in check_dict[chunk.file_id]
+        or check_dict[chunk.file_id].add(chunk.index)
+        for chunk in all_chunks
+    )
+    if has_duplicates:
+        raise ValueError(
+            "Duplicate chunk indices found in single document mode. "
+            "Each chunk must have a unique index."
        )
+
+    return all_chunks
+
+
+###################
+# Private Methods #
+###################
 
 
 def _get_relevant_chunks_for_ids(
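In short, `generate_cheatsheets()` now groups chunks and chunksets per document via `file_id` (falling back to the legacy `tag`, then to `"single_doc"`), and `generate_single_cheatsheet()` survives only as a deprecated wrapper. A small sketch against the new API; the data is invented, and it assumes the IDs in a chunkset's "chunks" list refer to the chunks' `chunk_index` values, which this diff does not show (`_get_relevant_chunks_for_ids` is not included):

```python
import warnings
from poma.retrieval import generate_cheatsheets, generate_single_cheatsheet

# Invented two-document corpus; "depth" is only consulted for code chunks.
all_chunks = [
    {"chunk_index": 0, "file_id": "a.md", "content": "A0", "depth": 0},
    {"chunk_index": 1, "file_id": "a.md", "content": "A1", "depth": 0},
    {"chunk_index": 0, "file_id": "b.md", "content": "B0", "depth": 0},
]
relevant = [
    {"file_id": "a.md", "chunks": [0, 1]},  # assumed ID convention
    {"file_id": "b.md", "chunks": [0]},
]

sheets = generate_cheatsheets(relevant, all_chunks)  # one dict per file_id

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    generate_single_cheatsheet(relevant, all_chunks)  # emits DeprecationWarning
    assert any(w.category is DeprecationWarning for w in caught)
```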
@@ -138,39 +282,58 @@ def _get_relevant_chunks_for_ids(
 
 
 def _cheatsheets_from_chunks(
-    content_chunks: list[dict[str, Any]],
+    content_chunks: list[RetrievalChunk],
 ) -> list[dict[str, Any]]:
+    if not content_chunks:
+        return []
+
+    if isinstance(content_chunks[0], dict):
+        raise ValueError(
+            "Input to _cheatsheets_from_chunks must be a list of RetrievalChunk objects, not dicts."
+            "Use chunks_from_dicts() to convert dicts to RetrievalChunk objects first."
+        )
+
+    def _format_chunk_content(chunk: "RetrievalChunk") -> str:
+        if not getattr(chunk, "depth_rebased", False):
+            return chunk.content
+        else:
+            indent = " " * 4 * (chunk.depth_rebased or 0)
+            return f"{indent}{chunk.content}"
+
     cheatsheets: list[dict] = []
 
     compressed_data = {}
+    content_chunks = sorted(content_chunks, key=lambda c: (c.file_id, c.index))
     for chunk in content_chunks:
-        if chunk["tag"] not in compressed_data:
-            # If there is data stored for a previous tag, save it to the cheatsheets list
+        if chunk.file_id not in compressed_data:
+            # If there is data stored for a previous file_id, save it to the cheatsheets list
             if compressed_data:
                 for key, value in compressed_data.items():
-                    cheatsheets.append({"tag": key, "content": value["content"]})
-                # Clear the compressed_data for the current tag
+                    cheatsheets.append(
+                        {"file_id": key, "tag": key, "content": value["content"]}
+                    )
+                # Clear the compressed_data for the current file_id
                 compressed_data.clear()
-            # Start a new entry for the current tag
-            compressed_data[chunk["tag"]] = {
-                "content": chunk["content"],
-                "last_chunk": chunk["chunk_index"],
+            # Start a new entry for the current file_id
+            compressed_data[chunk.file_id] = {
+                "content": _format_chunk_content(chunk),
+                "last_chunk": chunk.index,
             }
         else:
+            chunk_content = _format_chunk_content(chunk)
             # Check if chunks are consecutive
-            if (
-                chunk["chunk_index"]
-                == int(compressed_data[chunk["tag"]]["last_chunk"]) + 1
-            ):
-                compressed_data[chunk["tag"]]["content"] += "\n" + chunk["content"]
+            if chunk.index == int(compressed_data[chunk.file_id]["last_chunk"]) + 1:
+                compressed_data[chunk.file_id]["content"] += "\n" + chunk_content
            else:
-                compressed_data[chunk["tag"]]["content"] += "\n[…]\n" + chunk["content"]
+                compressed_data[chunk.file_id]["content"] += "\n[…]\n" + chunk_content
             # Update the last chunk index
-            compressed_data[chunk["tag"]]["last_chunk"] = chunk["chunk_index"]
+            compressed_data[chunk.file_id]["last_chunk"] = chunk.index
 
     # Save the last processed entry to the cheatsheets list
     if compressed_data:
         for key, value in compressed_data.items():
-            cheatsheets.append({"tag": key, "content": value["content"]})
+            cheatsheets.append(
+                {"file_id": key, "tag": key, "content": value["content"]}
+            )
 
     return cheatsheets
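The compression step joins consecutive chunks with newlines and marks gaps with "[…]", indenting code chunks by their rebased depth. A quick illustration with invented chunks, where indices 0 and 2 are non-consecutive on purpose:

```python
from poma.retrieval import chunks_from_dicts, _cheatsheets_from_chunks

# Invented chunks from one document; no "code" key, so depth is ignored.
chunk_dicts = [
    {"chunk_index": 0, "file_id": "doc", "content": "first", "depth": 0},
    {"chunk_index": 2, "file_id": "doc", "content": "third", "depth": 0},
]
sheet = _cheatsheets_from_chunks(chunks_from_dicts(chunk_dicts))[0]
print(sheet["content"])
# first
# […]
# third
```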
{poma-0.1.4.dist-info → poma-0.2.1.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: poma
-Version: 0.1.4
+Version: 0.2.1
 Summary: Official Python SDK for the Poma document-processing API
 Author-email: "POMA AI GmbH, Berlin" <sdk@poma-ai.com>
 License-Expression: MPL-2.0
@@ -40,15 +40,15 @@ pip install poma
 
 For integrations into LangChain and LlamaIndex:
 ```bash
-pip install poma[integrations]
+pip install 'poma[integrations]'
 # Or LangChain/LlamaIndex including example extras:
-pip install poma[integration-examples]
+pip install 'poma[integration-examples]'
 ```
 
 
 - You may also want: `pip install python-dotenv` to load API keys from a .env file.
 - API keys required (POMA_API_KEY) for the POMA AI client via environment variables.
-- **To request a POMA_API_KEY, please contact us at api@poma-ai.com**
+- **To request a POMA_API_KEY, please contact us at sdk@poma-ai.com**
 
 
 ### Example Implementations — all examples, integrations, and additional information can be found in our GitHub repository: [poma-ai/poma](https://github.com/poma-ai/)
poma-0.2.1.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+poma/__init__.py,sha256=SARVBTJw2pkIXR2_OYMPYjB7W335er_2-9j4yhzVTZI,266
+poma/client.py,sha256=sAaW7lz6s6WQDhkjMYfhOtd_d8baVflToc7q5l9BTYo,14337
+poma/exceptions.py,sha256=CsawBf0pwtZvm_0kPxLQ5WsIU_vCzNoMEMF_o5OJkBQ,519
+poma/retrieval.py,sha256=KcKa1pKjgWVieHflcPC6oiZO8ZZECHEpzqVlHC2RPDk,12475
+poma/integrations/__init__.py,sha256=xrrJluggTLtrKs4jLOZUWkFENqWSHSnhCqYQYY51kq0,405
+poma/integrations/langchain_poma.py,sha256=wDVrPJde7gRJR4ml3wS4qiL_dM4w2RDKVZYM_FrYgHM,14202
+poma/integrations/llamaindex_poma.py,sha256=biSAwcbKShsTdnjtCtM63GCzD5Nig4-85ZBquB_CG8w,14516
+poma-0.2.1.dist-info/licenses/LICENSE,sha256=YRzZ4sQOWV3ut0G4LHZJ2hT90shzZufGlXoIx4LWFEo,15254
+poma-0.2.1.dist-info/METADATA,sha256=vVhHvee7kq5DV4Yu7iCjk_abu8i3vW-I6PIoaDq_J5w,3083
+poma-0.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+poma-0.2.1.dist-info/top_level.txt,sha256=f_3c5Y6SojNnH0iiiE898fIKF6R2LqWyAw-BGi-72YI,5
+poma-0.2.1.dist-info/RECORD,,
poma-0.1.4.dist-info/RECORD DELETED
@@ -1,12 +0,0 @@
-poma/__init__.py,sha256=SARVBTJw2pkIXR2_OYMPYjB7W335er_2-9j4yhzVTZI,266
-poma/client.py,sha256=HOlNVfSPYzq1RjLVUtIqbObH5QSLRbi7KnbCnBjpUh4,14201
-poma/exceptions.py,sha256=5d8SdIIRFotKUJJAy9mct2q44oEmAsR15OVEmkLDfkQ,518
-poma/retrieval.py,sha256=bm68_1QscJXa76sxVuAkcwdeGsvEZaQkrY3-3uUxrIg,6730
-poma/integrations/__init__.py,sha256=xrrJluggTLtrKs4jLOZUWkFENqWSHSnhCqYQYY51kq0,405
-poma/integrations/langchain_poma.py,sha256=IL3pWWGCEK_O0JagpnKPFRwKclyNTwPcaTTdKJkYfYY,14608
-poma/integrations/llamaindex_poma.py,sha256=n3M71QXGVA2RTsUC24ZTt__VHEgsTbIW9BVwEn1Xxbg,14868
-poma-0.1.4.dist-info/licenses/LICENSE,sha256=YRzZ4sQOWV3ut0G4LHZJ2hT90shzZufGlXoIx4LWFEo,15254
-poma-0.1.4.dist-info/METADATA,sha256=LBg2pxFEr3zGgMahJS0kQSKAwS003fvMLQfHLpiZsX0,3079
-poma-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-poma-0.1.4.dist-info/top_level.txt,sha256=f_3c5Y6SojNnH0iiiE898fIKF6R2LqWyAw-BGi-72YI,5
-poma-0.1.4.dist-info/RECORD,,
Files without changes (identical hashes in RECORD): WHEEL, licenses/LICENSE, top_level.txt