chatterer 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. chatterer/__init__.py +62 -60
  2. chatterer/common_types/__init__.py +21 -0
  3. chatterer/common_types/io.py +19 -0
  4. chatterer/language_model.py +577 -577
  5. chatterer/messages.py +9 -9
  6. chatterer/strategies/__init__.py +13 -13
  7. chatterer/strategies/atom_of_thoughts.py +975 -975
  8. chatterer/strategies/base.py +14 -14
  9. chatterer/tools/__init__.py +35 -28
  10. chatterer/tools/citation_chunking/__init__.py +3 -3
  11. chatterer/tools/citation_chunking/chunks.py +53 -53
  12. chatterer/tools/citation_chunking/citation_chunker.py +118 -118
  13. chatterer/tools/citation_chunking/citations.py +285 -285
  14. chatterer/tools/citation_chunking/prompt.py +157 -157
  15. chatterer/tools/citation_chunking/reference.py +26 -26
  16. chatterer/tools/citation_chunking/utils.py +138 -138
  17. chatterer/tools/convert_to_text.py +418 -463
  18. chatterer/tools/upstage_document_parser.py +438 -0
  19. chatterer/tools/webpage_to_markdown/__init__.py +4 -4
  20. chatterer/tools/webpage_to_markdown/playwright_bot.py +649 -649
  21. chatterer/tools/webpage_to_markdown/utils.py +334 -334
  22. chatterer/tools/youtube.py +146 -146
  23. chatterer/utils/__init__.py +15 -15
  24. chatterer/utils/bytesio.py +59 -0
  25. chatterer/utils/code_agent.py +138 -138
  26. chatterer/utils/image.py +291 -291
  27. {chatterer-0.1.12.dist-info → chatterer-0.1.13.dist-info}/METADATA +171 -170
  28. chatterer-0.1.13.dist-info/RECORD +31 -0
  29. chatterer-0.1.12.dist-info/RECORD +0 -27
  30. {chatterer-0.1.12.dist-info → chatterer-0.1.13.dist-info}/WHEEL +0 -0
  31. {chatterer-0.1.12.dist-info → chatterer-0.1.13.dist-info}/top_level.txt +0 -0
chatterer/tools/upstage_document_parser.py
@@ -0,0 +1,438 @@
+ """Adapted from `langchain_upstage.document_parse`."""
+
+ import io
+ import json
+ import logging
+ import os
+ from typing import Iterator, Literal, Optional, cast
+
+ import requests
+ from langchain_core.document_loaders import BaseBlobParser, Blob
+ from langchain_core.documents import Document
+ from pydantic import BaseModel, Field
+ from pypdf import PdfReader, PdfWriter
+ from pypdf.errors import PdfReadError
+
+ from ..common_types.io import BytesReadable
+ from ..language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
+ from ..utils.image import Base64Image
+
+ logger = logging.getLogger("pypdf")
+ logger.setLevel(logging.ERROR)
+
+ DOCUMENT_PARSE_BASE_URL = "https://api.upstage.ai/v1/document-ai/document-parse"
+ DEFAULT_NUM_PAGES = 10
+ DOCUMENT_PARSE_DEFAULT_MODEL = "document-parse"
+
+ OutputFormat = Literal["text", "html", "markdown"]
+ OCR = Literal["auto", "force"]
+ SplitType = Literal["none", "page", "element"]
+ Category = Literal[
+     "paragraph",
+     "table",
+     "figure",
+     "header",
+     "footer",
+     "caption",
+     "equation",
+     "heading1",
+     "list",
+     "index",
+     "footnote",
+     "chart",
+ ]
+
+
+ class Content(BaseModel):
+     text: Optional[str] = None
+     html: Optional[str] = None
+     markdown: Optional[str] = None
+
+
+ class Coordinate(BaseModel):
+     x: float
+     y: float
+
+
+ class Element(BaseModel):
+     category: Category
+     content: Content
+     coordinates: list[Coordinate] = Field(default_factory=list)
+     base64_encoding: str = ""
+     id: int
+     page: int
+
+     def parse_text(self, parser: "UpstageDocumentParseParser") -> str:
+         output_format: OutputFormat = parser.output_format
+         chatterer: Optional[Chatterer] = parser.chatterer
+         image_description_instruction: str = parser.image_description_instruction
+         output: Optional[str] = None
+         if output_format == "text":
+             output = self.content.text
+         elif output_format == "html":
+             output = self.content.html
+         elif output_format == "markdown":
+             output = self.content.markdown
+         if output is None:
+             raise ValueError(f"Invalid output format: {output_format}")
+
+         if chatterer is not None and self.category == "figure" and self.base64_encoding:
+             image = Base64Image.from_string(f"data:image/jpeg;base64,{self.base64_encoding}")
+             if image is None:
+                 raise ValueError(f"Invalid base64 encoding for image: {self.base64_encoding}")
+             ocr_content = output.removeprefix("![image](/image/placeholder)\n")
+             image_description = chatterer.describe_image(
+                 image.data_uri,
+                 image_description_instruction
+                 + f"\nHint: The OCR detected the following text:\n```\n{ocr_content}\n```",
+             )
+             output = f"\n\n<details>\n{image_description}\n</details>\n\n"
+
+         return output
+
+
+ def get_from_param_or_env(
+     key: str,
+     param: Optional[str] = None,
+     env_key: Optional[str] = None,
+     default: Optional[str] = None,
+ ) -> str:
+     """Get a value from a param or an environment variable."""
+     if param is not None:
+         return param
+     elif env_key and env_key in os.environ and os.environ[env_key]:
+         return os.environ[env_key]
+     elif default is not None:
+         return default
+     else:
+         raise ValueError(
+             f"Did not find {key}, please add an environment variable"
+             f" `{env_key}` which contains it, or pass"
+             f" `{key}` as a named parameter."
+         )
+
+
+ class UpstageDocumentParseParser(BaseBlobParser):
+     """Upstage Document Parse Parser.
+
+     To use, you should have the environment variable `UPSTAGE_API_KEY`
+     set with your API key or pass it as a named parameter to the constructor.
+
+     Example:
+         .. code-block:: python
+
+             from langchain_upstage import UpstageDocumentParseParser
+
+             loader = UpstageDocumentParseParser(split="page", output_format="text")
+     """
+
+     def __init__(
+         self,
+         api_key: Optional[str] = None,
+         base_url: str = DOCUMENT_PARSE_BASE_URL,
+         model: str = DOCUMENT_PARSE_DEFAULT_MODEL,
+         split: SplitType = "none",
+         ocr: OCR = "auto",
+         output_format: OutputFormat = "markdown",
+         coordinates: bool = True,
+         base64_encoding: list[Category] = [],
+         chatterer: Optional[Chatterer] = None,
+         image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION,
+     ) -> None:
+         """
+         Initializes an instance of the Upstage class.
+
+         Args:
+             api_key (str, optional): The API key for accessing the Upstage API.
+                 Defaults to None, in which case it will be
+                 fetched from the environment variable
+                 `UPSTAGE_API_KEY`.
+             base_url (str, optional): The base URL for accessing the Upstage API.
+             model (str): The model to be used for the document parse.
+                 Defaults to "document-parse".
+             split (SplitType, optional): The type of splitting to be applied.
+                 Defaults to "none" (no splitting).
+             ocr (OCR, optional): Extract text from images in the document using OCR.
+                 If the value is "force", OCR is used to extract
+                 text from an image. If the value is "auto", text is
+                 extracted from a PDF. (An error will occur if the
+                 value is "auto" and the input is NOT in PDF format)
+             output_format (OutputFormat, optional): Format of the inference results.
+             coordinates (bool, optional): Whether to include the coordinates of the
+                 OCR in the output.
+             base64_encoding (List[Category], optional): The category of the elements to
+                 be encoded in base64.
+             chatterer (Chatterer, optional): The Chatterer instance to use for image
+                 description.
+             image_description_instruction (str, optional): The instruction to use for
+                 image description.
+
+
+         """
+         self.api_key = get_from_param_or_env(
+             "UPSTAGE_API_KEY",
+             api_key,
+             "UPSTAGE_API_KEY",
+             os.environ.get("UPSTAGE_API_KEY"),
+         )
+         self.base_url = base_url
+         self.model = model
+         self.split: SplitType = split
+         self.ocr: OCR = ocr
+         self.output_format: OutputFormat = output_format
+         self.coordinates = coordinates
+         self.base64_encoding: list[Category] = base64_encoding
+         self.chatterer = chatterer
+         self.image_description_instruction = image_description_instruction
+
+     def _get_response(self, files: dict[str, BytesReadable]) -> list[Element]:
+         """
+         Sends a POST request to the API endpoint with the provided files and
+         returns the response.
+
+         Args:
+             files (dict): A dictionary containing the files to be sent in the request.
+
+         Returns:
+             list[Element]: The elements parsed from the JSON response.
+
+         Raises:
+             ValueError: If there is an error in the API call.
+         """
+         try:
+             headers = {
+                 "Authorization": f"Bearer {self.api_key}",
+             }
+             response = requests.post(
+                 self.base_url,
+                 headers=headers,
+                 files=files,
+                 data={
+                     "ocr": self.ocr,
+                     "model": self.model,
+                     "output_formats": f"['{self.output_format}']",
+                     "coordinates": self.coordinates,
+                     "base64_encoding": f"{self.base64_encoding}",
+                 },
+             )
+             response.raise_for_status()
+             result: object = response.json().get("elements", [])
+             if not isinstance(result, list):
+                 raise ValueError(f"Failed to parse JSON data: {result}")
+             result = cast(list[object], result)
+             return [Element.model_validate(element) for element in result]
+         except requests.HTTPError as e:
+             raise ValueError(f"HTTP error: {e.response.text}")
+         except requests.RequestException as e:
+             # Handle any request-related exceptions
+             raise ValueError(f"Failed to send request: {e}")
+         except json.JSONDecodeError as e:
+             # Handle JSON decode errors
+             raise ValueError(f"Failed to decode JSON response: {e}")
+         except Exception as e:
+             # Handle any other exceptions
+             raise ValueError(f"An error occurred: {e}")
+
+     def _split_and_request(
+         self, full_docs: PdfReader, start_page: int, num_pages: int = DEFAULT_NUM_PAGES
+     ) -> list[Element]:
+         """
+         Splits the full pdf document into partial pages and sends a request to the
+         server.
+
+         Args:
+             full_docs (PdfReader): The full document to be split and requested.
+             start_page (int): The starting page number for splitting the document.
+             num_pages (int, optional): The number of pages to split the document
+                 into.
+                 Defaults to DEFAULT_NUM_PAGES.
+
+         Returns:
+             list[Element]: The elements parsed from the server response.
+ """
253
+ merger = PdfWriter()
254
+ merger.append(
255
+ full_docs,
256
+ pages=(start_page, min(start_page + num_pages, full_docs.get_num_pages())),
257
+ )
258
+
259
+ with io.BytesIO() as buffer:
260
+ merger.write(buffer)
261
+ buffer.seek(0)
262
+ return self._get_response({"document": buffer})
263
+
264
+ def _element_document(self, element: Element, start_page: int = 0) -> Document:
265
+ """
266
+         Converts an element into a Document object.
+
+         Args:
+             element (Element): The element to convert.
+             start_page (int): The starting page number for splitting the document.
+                 This number starts from zero.
+
+         Returns:
+             Document: The converted Document object.
+
+         """
+         metadata: dict[str, object] = element.model_dump(exclude_none=True)
+         metadata["page"] = element.page + start_page
+         return Document(
+             page_content=element.parse_text(self),
+             metadata=metadata,
+         )
+
+     def _page_document(self, elements: list[Element], start_page: int = 0) -> list[Document]:
+         """
+         Combines elements with the same page number into a single Document object.
+
+         Args:
+             elements (List): A list of elements containing page numbers.
+             start_page (int): The starting page number for splitting the document.
+                 This number starts from zero.
+
+         Returns:
+             List[Document]: A list of Document objects, each representing a page
+                 with its content and metadata.
+         """
+         documents: list[Document] = []
+         pages: list[int] = sorted(set(map(lambda x: x.page, elements)))
+         page_group: list[list[Element]] = [[element for element in elements if element.page == x] for x in pages]
+         for group in page_group:
+             metadata: dict[str, object] = {
+                 "page": group[0].page + start_page,
+             }
+             if self.base64_encoding:
+                 metadata["base64_encodings"] = [element.base64_encoding for element in group if element.base64_encoding]
+             if self.coordinates:
+                 metadata["coordinates"] = [element.coordinates for element in group if element.coordinates]
+             documents.append(
+                 Document(
+                     page_content=" ".join(element.parse_text(self) for element in group),
+                     metadata=metadata,
+                 )
+             )
+
+         return documents
+
+     def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
+         """
+         Lazily parses a document and yields Document objects based on the specified
+         split type.
+
+         Args:
+             blob (Blob): The input document blob to parse.
+             is_batch (bool, optional): Whether to parse the document in batches.
+                 Defaults to False (single page parsing)
+
+         Yields:
+             Document: The parsed document object.
+
+         Raises:
+             ValueError: If an invalid split type is provided.
+
+         """
+
+         if is_batch:
+             num_pages = DEFAULT_NUM_PAGES
+         else:
+             num_pages = 1
+
+         full_docs: Optional[PdfReader] = None
+         try:
+             full_docs = PdfReader(str(blob.path))
+             number_of_pages = full_docs.get_num_pages()
+         except PdfReadError:
+             number_of_pages = 1
+         except Exception as e:
+             raise ValueError(f"Failed to read PDF file: {e}")
+
+         if self.split == "none":
+             result = ""
+             base64_encodings: list[str] = []
+             coordinates: list[list[Coordinate]] = []
+
+             if full_docs is not None:
+                 start_page = 0
+                 num_pages = DEFAULT_NUM_PAGES
+                 for _ in range(number_of_pages):
+                     if start_page >= number_of_pages:
+                         break
+
+                     elements = self._split_and_request(full_docs, start_page, num_pages)
+                     for element in elements:
+                         result += element.parse_text(self)
+                         if self.base64_encoding and (base64_encoding := element.base64_encoding):
+                             base64_encodings.append(base64_encoding)
+                         if self.coordinates and (coords := element.coordinates):
+                             coordinates.append(coords)
+
+                     start_page += num_pages
+
+             else:
+                 if not blob.path:
+                     raise ValueError("Blob path is required for non-PDF files.")
+
+                 with open(blob.path, "rb") as f:
+                     elements = self._get_response({"document": f})
+
+                 for element in elements:
+                     result += element.parse_text(self)
+
+                     if self.base64_encoding and (base64_encoding := element.base64_encoding):
+                         base64_encodings.append(base64_encoding)
+                     if self.coordinates and (coords := element.coordinates):
+                         coordinates.append(coords)
+             metadata: dict[str, object] = {"total_pages": number_of_pages}
+             if self.coordinates:
+                 metadata["coordinates"] = coordinates
+             if self.base64_encoding:
+                 metadata["base64_encodings"] = base64_encodings
+
+             yield Document(
+                 page_content=result,
+                 metadata=metadata,
+             )
+
+         elif self.split == "element":
+             if full_docs is not None:
+                 start_page = 0
+                 for _ in range(number_of_pages):
+                     if start_page >= number_of_pages:
+                         break
+
+                     elements = self._split_and_request(full_docs, start_page, num_pages)
+                     for element in elements:
+                         yield self._element_document(element, start_page)
+
+                     start_page += num_pages
+
+             else:
+                 if not blob.path:
+                     raise ValueError("Blob path is required for non-PDF files.")
+                 with open(blob.path, "rb") as f:
+                     elements = self._get_response({"document": f})
+
+                 for element in elements:
+                     yield self._element_document(element)
+
+         elif self.split == "page":
+             if full_docs is not None:
+                 start_page = 0
+                 for _ in range(number_of_pages):
+                     if start_page >= number_of_pages:
+                         break
+
+                     elements = self._split_and_request(full_docs, start_page, num_pages)
+                     yield from self._page_document(elements, start_page)
+
+                     start_page += num_pages
+             else:
+                 if not blob.path:
+                     raise ValueError("Blob path is required for non-PDF files.")
+                 with open(blob.path, "rb") as f:
+                     elements = self._get_response({"document": f})
+
+                 yield from self._page_document(elements)
+
+         else:
+             raise ValueError(f"Invalid split type: {self.split}")
chatterer/tools/webpage_to_markdown/__init__.py
@@ -1,4 +1,4 @@
- from .playwright_bot import PlayWrightBot
- from .utils import MarkdownLink
-
- __all__ = ["PlayWrightBot", "MarkdownLink"]
+ from .playwright_bot import PlayWrightBot
+ from .utils import MarkdownLink
+
+ __all__ = ["PlayWrightBot", "MarkdownLink"]