chatterer 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
chatterer/__init__.py CHANGED
@@ -16,12 +16,13 @@ from .tools import (
16
16
  anything_to_markdown,
17
17
  citation_chunker,
18
18
  get_default_html_to_markdown_options,
19
+ get_youtube_video_details,
19
20
  get_youtube_video_subtitle,
20
21
  html_to_markdown,
22
+ init_upstage_document_parser,
21
23
  init_webpage_to_markdown,
22
24
  pdf_to_text,
23
25
  pyscripts_to_snippets,
24
- get_youtube_video_details,
25
26
  )
26
27
  from .utils import (
27
28
  Base64Image,
@@ -57,4 +58,5 @@ __all__ = [
57
58
  "get_youtube_video_subtitle",
58
59
  "get_youtube_video_details",
59
60
  "interactive_shell",
61
+ "init_upstage_document_parser",
60
62
  ]
@@ -0,0 +1,21 @@
1
+ from .io import (
2
+ BytesReadable,
3
+ BytesWritable,
4
+ FileDescriptorOrPath,
5
+ PathOrReadable,
6
+ Readable,
7
+ StringReadable,
8
+ StringWritable,
9
+ Writable,
10
+ )
11
+
12
+ __all__ = [
13
+ "BytesReadable",
14
+ "BytesWritable",
15
+ "FileDescriptorOrPath",
16
+ "PathOrReadable",
17
+ "Readable",
18
+ "StringReadable",
19
+ "StringWritable",
20
+ "Writable",
21
+ ]
@@ -0,0 +1,19 @@
1
import os
from io import BufferedReader, BufferedWriter, BytesIO, StringIO, TextIOWrapper
from typing import TypeAlias

# A file designator: an integer file descriptor, a str/bytes path, or an
# os.PathLike object wrapping either kind of path.
FileDescriptorOrPath: TypeAlias = int | str | bytes | os.PathLike[str] | os.PathLike[bytes]

# Concrete stdlib stream classes, grouped by payload type (bytes vs. str) and direction.
BytesReadable: TypeAlias = BytesIO | BufferedReader
BytesWritable: TypeAlias = BytesIO | BufferedWriter
# NOTE: the two text aliases below are the same union; they differ only in the
# intent they communicate at the use site.
StringReadable: TypeAlias = StringIO | TextIOWrapper
StringWritable: TypeAlias = StringIO | TextIOWrapper

# Direction-only aliases that accept either the bytes or the text flavour.
Readable: TypeAlias = BytesReadable | StringReadable
Writable: TypeAlias = BytesWritable | StringWritable

# Either a file designator or an already-open readable stream.
PathOrReadable: TypeAlias = FileDescriptorOrPath | Readable
@@ -465,6 +465,68 @@ def interactive_shell(
465
465
  description="Whether further Python tool calling is needed to answer user query."
466
466
  )
467
467
 
468
def respond(messages: list[BaseMessage]) -> str:
    """Stream the model's reply for ``messages`` to the console and return it.

    Each chunk yielded by ``chatterer.generate_stream`` is echoed as soon as it
    arrives; the chunks are then joined and surrounding whitespace is removed.

    Args:
        messages: Conversation to send to the model.

    Returns:
        The complete reply with leading/trailing whitespace stripped.
    """
    # Print the speaker label without a newline so the streamed text follows it.
    console.print("[bold blue]AI:[/bold blue] ", end="")
    pieces: list[str] = []
    for piece in chatterer.generate_stream(messages=messages):
        pieces.append(piece)
        console.print(piece, end="")
    # Terminate the streamed line once the reply is complete.
    console.print()
    return "".join(pieces).strip()
477
+
478
def code_session_returning_end_of_turn() -> bool:
    """Run a code-execution session and report whether it finished the turn.

    The loop repeatedly asks the model for code, runs it through ``repl_tool``,
    prints the code and its output, and then asks the model (via
    ``IsFurtherCodeExecutionNeeded``) whether another round is required.

    Returns:
        ``True`` when the session produced the final answer: the last tool-use
        message and the streamed reply have been appended to ``context``.
        ``False`` when the model produced no runnable code (empty, ``quit``,
        ``exit`` or ``pass``); nothing is appended to ``context`` in that case
        and the caller is expected to answer on its own.
    """
    code_session_messages: list[BaseMessage] = []
    while True:
        code_execution: CodeExecutionResult = chatterer.invoke_code_execution(
            # Include what already happened in this session, exactly like the
            # follow-up decision call below does. Passing only ``context`` hid
            # earlier executions and their output from the code generator.
            messages=context + code_session_messages,
            repl_tool=repl_tool,
            prompt_for_code_invoke=prompt_for_code_invoke,
            function_signatures=function_signatures,
            function_reference_prefix=function_reference_prefix,
            function_reference_seperator=function_reference_seperator,
            config=config,
            stop=stop,
            **kwargs,
        )
        if code_execution.code.strip() in ("", "quit", "exit", "pass"):
            # NOTE(review): when this triggers after at least one successful
            # round, the messages gathered so far are dropped because the caller
            # answers from ``context`` alone - confirm that this is intended.
            return False

        last_tool_use_message = AIMessage(
            content=f"Executed code:\n```python\n{code_execution.code}\n```\nOutput:\n{code_execution.output}".strip()
        )
        code_session_messages.append(last_tool_use_message)
        console.print("[bold yellow]Executed code:[/bold yellow]")
        console.print(f"[code]{code_execution.code}[/code]")
        console.print("[bold yellow]Output:[/bold yellow]")
        console.print(code_execution.output)

        # Ask the model to review the execution and decide whether to continue.
        decision = chatterer.generate_pydantic(
            response_model=IsFurtherCodeExecutionNeeded,
            messages=augment_prompt_for_toolcall(
                function_signatures=function_signatures,
                messages=context + code_session_messages,
                prompt_for_code_invoke=prompt_for_code_invoke,
                function_reference_prefix=function_reference_prefix,
                function_reference_seperator=function_reference_seperator,
            ),
        )
        review_on_code_execution = decision.review_on_code_execution.strip()
        next_action = decision.next_action.strip()
        console.print("[bold blue]AI:[/bold blue]")
        console.print(f"-[bold yellow]Review on code execution:[/bold yellow] {review_on_code_execution}")
        console.print(f"-[bold yellow]Next Action:[/bold yellow] {next_action}")
        code_session_messages.append(
            AIMessage(
                content=f"- Review upon code execution: {review_on_code_execution}\n- Next Action: {next_action}".strip()
            )
        )
        if not decision.is_further_code_execution_needed:
            # The answer is generated with the whole session visible, but only
            # the last tool-use message and the answer are kept in ``context``.
            response: str = respond(context + code_session_messages)
            context.append(last_tool_use_message)
            context.append(AIMessage(content=response))
            return True
529
+
468
530
  # REPL 도구 초기화
469
531
  if repl_tool is None:
470
532
  repl_tool = get_default_repl_tool()
@@ -504,77 +566,11 @@ def interactive_shell(
504
566
  )
505
567
 
506
568
  # 코드 실행 처리
507
- if decision.is_code_execution_needed:
508
- code_result = chatterer.invoke_code_execution(
509
- messages=context,
510
- repl_tool=repl_tool,
511
- prompt_for_code_invoke=prompt_for_code_invoke,
512
- function_signatures=function_signatures,
513
- function_reference_prefix=function_reference_prefix,
514
- function_reference_seperator=function_reference_seperator,
515
- config=config,
516
- stop=stop,
517
- **kwargs,
518
- )
519
-
520
- if code_result.code.strip() == "pass":
521
- tool_use_message = None
522
- else:
523
- code_session_messages: list[BaseMessage] = []
524
- while True:
525
- code_execution_message = AIMessage(
526
- content=f"Executed code:\n```python\n{code_result.code}\n```\nOutput:\n{code_result.output}".strip()
527
- )
528
- code_session_messages.append(code_execution_message)
529
- console.print("[bold yellow]Executed code:[/bold yellow]")
530
- console.print(f"[code]{code_result.code}[/code]")
531
- console.print("[bold yellow]Output:[/bold yellow]")
532
- console.print(code_result.output)
533
-
534
- decision = chatterer.generate_pydantic(
535
- response_model=IsFurtherCodeExecutionNeeded,
536
- messages=augment_prompt_for_toolcall(
537
- function_signatures=function_signatures,
538
- messages=context + code_session_messages,
539
- prompt_for_code_invoke=prompt_for_code_invoke,
540
- function_reference_prefix=function_reference_prefix,
541
- function_reference_seperator=function_reference_seperator,
542
- ),
543
- )
544
- review_on_code_execution = decision.review_on_code_execution.strip()
545
- next_action = decision.next_action.strip()
546
- console.print("[bold blue]AI:[/bold blue]")
547
- console.print(f"-[bold yellow]Review on code execution:[/bold yellow] {review_on_code_execution}")
548
- console.print(f"-[bold yellow]Next Action:[/bold yellow] {next_action}")
549
- code_session_messages.append(
550
- AIMessage(
551
- content=f"- Review upon code execution: {review_on_code_execution}\n- Next Action: {next_action}".strip()
552
- )
553
- )
554
- if not decision.is_further_code_execution_needed:
555
- tool_use_message = code_execution_message
556
- break
557
- else:
558
- tool_use_message = None
559
-
560
- # 코드 실행 결과 컨텍스트에 추가
561
- if tool_use_message:
562
- context.append(tool_use_message)
569
+ if decision.is_code_execution_needed and code_session_returning_end_of_turn():
570
+ continue
563
571
 
564
572
  # AI 응답 스트리밍 출력
565
- console.print("[bold blue]AI:[/bold blue] ", end="")
566
- response = ""
567
- for chunk in chatterer.generate_stream(messages=context):
568
- response += chunk
569
- console.print(chunk, end="")
570
-
571
- # 전체 응답 처리 후 컨텍스트에 추가
572
- lines = response.split("\n")
573
- if lines:
574
- lines[-1] = lines[-1].rstrip() # 마지막 줄의 오른쪽 공백 제거
575
- response = "\n".join(lines).strip()
576
- context.append(AIMessage(content=response))
577
- console.print() # 응답 후 줄바꿈 추가
573
+ context.append(AIMessage(content=respond(context)))
578
574
 
579
575
 
580
576
  if __name__ == "__main__":
@@ -6,7 +6,7 @@ from .convert_to_text import (
6
6
  pdf_to_text,
7
7
  pyscripts_to_snippets,
8
8
  )
9
- from .youtube import get_youtube_video_subtitle, get_youtube_video_details
9
+ from .youtube import get_youtube_video_details, get_youtube_video_subtitle
10
10
 
11
11
 
12
12
  def init_webpage_to_markdown():
@@ -15,6 +15,12 @@ def init_webpage_to_markdown():
15
15
  return webpage_to_markdown
16
16
 
17
17
 
18
def init_upstage_document_parser():
    """Import the sibling ``upstage_document_parser`` module and return it.

    The import happens inside the function, so the module (and whatever it
    imports) is only loaded when this function is called.
    """
    from . import upstage_document_parser as parser_module

    return parser_module
22
+
23
+
18
24
  __all__ = [
19
25
  "html_to_markdown",
20
26
  "anything_to_markdown",
@@ -25,4 +31,5 @@ __all__ = [
25
31
  "init_webpage_to_markdown",
26
32
  "get_youtube_video_subtitle",
27
33
  "get_youtube_video_details",
34
+ "init_upstage_document_parser",
28
35
  ]
@@ -3,14 +3,11 @@ import importlib
3
3
  import os
4
4
  import re
5
5
  import site
6
- from contextlib import contextmanager, suppress
7
6
  from fnmatch import fnmatch
8
- from io import BufferedReader, BufferedWriter, BytesIO, StringIO, TextIOWrapper
9
7
  from pathlib import Path
10
8
  from typing import (
11
9
  TYPE_CHECKING,
12
10
  Callable,
13
- Iterator,
14
11
  NamedTuple,
15
12
  NotRequired,
16
13
  Optional,
@@ -20,6 +17,9 @@ from typing import (
20
17
  TypedDict,
21
18
  )
22
19
 
20
+ from ..common_types.io import PathOrReadable
21
+ from ..utils.bytesio import read_bytes_stream
22
+
23
23
  if TYPE_CHECKING:
24
24
  from bs4 import Tag
25
25
  from openai import OpenAI
@@ -38,20 +38,6 @@ type FileTree = dict[str, Optional[FileTree]]
38
38
 
39
39
  # Type aliases for callback functions and file descriptors
40
40
  CodeLanguageCallback: TypeAlias = Callable[["Tag"], Optional[str]]
41
- FileDescriptorOrPath: TypeAlias = int | str | bytes | os.PathLike[str] | os.PathLike[bytes]
42
-
43
- # Type aliases for different types of IO objects
44
- BytesReadable: TypeAlias = BytesIO | BufferedReader
45
- BytesWritable: TypeAlias = BytesIO | BufferedWriter
46
- StringReadable: TypeAlias = StringIO | TextIOWrapper
47
- StringWritable: TypeAlias = StringIO | TextIOWrapper
48
-
49
- # Combined type aliases for readable and writable objects
50
- Readable: TypeAlias = BytesReadable | StringReadable
51
- Writable: TypeAlias = BytesWritable | StringWritable
52
-
53
- # Type alias for path or readable object
54
- PathOrReadable: TypeAlias = FileDescriptorOrPath | Readable
55
41
 
56
42
 
57
43
  class HtmlToMarkdownOptions(TypedDict):
@@ -240,7 +226,7 @@ def pdf_to_text(path_or_file: PathOrReadable) -> str:
240
226
  """
241
227
  from pymupdf import Document # pyright: ignore[reportMissingTypeStubs]
242
228
 
243
- with _open_stream(path_or_file) as stream:
229
+ with read_bytes_stream(path_or_file) as stream:
244
230
  if stream is None:
245
231
  raise FileNotFoundError(path_or_file)
246
232
  return "\n".join(
@@ -430,34 +416,3 @@ def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str
430
416
  if p.is_file()
431
417
  ]
432
418
  return [p for p in pypaths if not ban_fn_patterns or not _is_banned(p, ban_fn_patterns)]
433
-
434
-
435
- @contextmanager
436
- def _open_stream(
437
- path_or_file: PathOrReadable,
438
- ) -> Iterator[Optional[BytesReadable]]:
439
- """
440
- Context manager for opening a file or using an existing stream.
441
-
442
- Handles different types of input (file paths, byte streams, string streams)
443
- and yields a BytesReadable object that can be used to read binary data.
444
-
445
- Args:
446
- path_or_file: File path or readable object.
447
-
448
- Yields:
449
- Optional[BytesReadable]: A readable binary stream or None if opening fails.
450
- """
451
- stream: Optional[BytesReadable] = None
452
- try:
453
- with suppress(BaseException):
454
- if isinstance(path_or_file, BytesReadable):
455
- stream = path_or_file
456
- elif isinstance(path_or_file, StringReadable):
457
- stream = BytesIO(path_or_file.read().encode("utf-8"))
458
- else:
459
- stream = open(path_or_file, "rb")
460
- yield stream
461
- finally:
462
- if stream is not None:
463
- stream.close()
@@ -0,0 +1,438 @@
1
+ """Adapted from `langchain_upstage.document_parse`."""
2
+
3
+ import io
4
+ import json
5
+ import logging
6
+ import os
7
+ from typing import Iterator, Literal, Optional, cast
8
+
9
+ import requests
10
+ from langchain_core.document_loaders import BaseBlobParser, Blob
11
+ from langchain_core.documents import Document
12
+ from pydantic import BaseModel, Field
13
+ from pypdf import PdfReader, PdfWriter
14
+ from pypdf.errors import PdfReadError
15
+
16
+ from ..common_types.io import BytesReadable
17
+ from ..language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
18
+ from ..utils.image import Base64Image
19
+
20
# NOTE: this is the logger named "pypdf", not a logger for this module; its
# level is set to ERROR, which affects every user of that logger in the process.
logger = logging.getLogger("pypdf")
logger.setLevel(logging.ERROR)

# Default endpoint that requests are POSTed to.
DOCUMENT_PARSE_BASE_URL = "https://api.upstage.ai/v1/document-ai/document-parse"
# Number of PDF pages bundled into one request when batching.
DEFAULT_NUM_PAGES = 10
# Default value sent as the "model" form field.
DOCUMENT_PARSE_DEFAULT_MODEL = "document-parse"

# Which rendering of an element's content is used as its text.
OutputFormat = Literal["text", "html", "markdown"]
# Value sent as the "ocr" form field.
OCR = Literal["auto", "force"]
# Granularity of the documents produced by the parser.
SplitType = Literal["none", "page", "element"]
# Element categories accepted when validating API responses.
Category = Literal[
    "paragraph",
    "table",
    "figure",
    "header",
    "footer",
    "caption",
    "equation",
    "heading1",
    "list",
    "index",
    "footnote",
    "chart",
]
44
+
45
+
46
class Content(BaseModel):
    """Renderings of a single parsed element; each one may be absent."""

    text: str | None = None
    html: str | None = None
    markdown: str | None = None
50
+
51
+
52
class Coordinate(BaseModel):
    """A single (x, y) point belonging to an element's coordinate list."""

    x: float
    y: float
55
+
56
+
57
class Element(BaseModel):
    """One element of a document-parse response, validated from the API JSON."""

    category: Category
    content: Content
    coordinates: list[Coordinate] = Field(default_factory=list)
    base64_encoding: str = ""
    id: int
    page: int

    def parse_text(self, parser: "UpstageDocumentParseParser") -> str:
        """Return this element's text in the parser's configured output format.

        When the parser has a ``chatterer``, and this element is a ``figure``
        carrying a base64 payload, the text is replaced by an image description
        generated by the chatterer and wrapped in a ``<details>`` block. The
        text recognised for the figure is passed along as a hint.

        Args:
            parser: Parser supplying ``output_format``, ``chatterer`` and
                ``image_description_instruction``.

        Returns:
            The element text, or the generated figure description.

        Raises:
            ValueError: If the output format is unknown, if the element has no
                content in the requested format, or if the base64 payload cannot
                be turned into an image.
        """
        output_format: OutputFormat = parser.output_format
        chatterer: Optional[Chatterer] = parser.chatterer
        image_description_instruction: str = parser.image_description_instruction

        output: Optional[str]
        if output_format == "text":
            output = self.content.text
        elif output_format == "html":
            output = self.content.html
        elif output_format == "markdown":
            output = self.content.markdown
        else:
            raise ValueError(f"Invalid output format: {output_format}")
        if output is None:
            # The format is valid but the content for it is missing; report that
            # instead of blaming the format.
            raise ValueError(f"Element {self.id} on page {self.page} has no {output_format} content.")

        if chatterer is not None and self.category == "figure" and self.base64_encoding:
            image = Base64Image.from_string(f"data:image/jpeg;base64,{self.base64_encoding}")
            if image is None:
                # Only show the head of the payload: it can be arbitrarily large.
                preview = self.base64_encoding[:64]
                suffix = "..." if len(self.base64_encoding) > 64 else ""
                raise ValueError(f"Invalid base64 encoding for image: {preview}{suffix}")
            # Drop the image placeholder line so that only the recognised text is sent as a hint.
            ocr_content = output.removeprefix("![image](/image/placeholder)\n")
            image_description = chatterer.describe_image(
                image.data_uri,
                image_description_instruction
                + f"\nHint: The OCR detected the following text:\n```\n{ocr_content}\n```",
            )
            output = f"\n\n<details>\n{image_description}\n</details>\n\n"

        return output
92
+
93
+
94
def get_from_param_or_env(
    key: str,
    param: Optional[str] = None,
    env_key: Optional[str] = None,
    default: Optional[str] = None,
) -> str:
    """Resolve a setting from an explicit value, the environment, or a default.

    Precedence: ``param`` (any non-``None`` value, including ``""``), then a
    non-empty environment variable named ``env_key``, then ``default``.

    Args:
        key: Human-readable name of the setting, used in the error message.
        param: Explicitly supplied value.
        env_key: Name of the environment variable to consult.
        default: Fallback value.

    Returns:
        The first value found following the precedence above.

    Raises:
        ValueError: If none of the three sources provides a value.
    """
    if param is not None:
        return param

    # An unset and an empty environment variable are treated alike.
    from_env = os.environ.get(env_key) if env_key else None
    if from_env:
        return from_env

    if default is not None:
        return default

    raise ValueError(
        f"Did not find {key}, please add an environment variable"
        f" `{env_key}` which contains it, or pass"
        f" `{key}` as a named parameter."
    )
113
+
114
+
115
class UpstageDocumentParseParser(BaseBlobParser):
    """Blob parser backed by the Upstage Document Parse API.

    To use, you should have the environment variable `UPSTAGE_API_KEY`
    set with your API key or pass it as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from chatterer.tools.upstage_document_parser import UpstageDocumentParseParser

            loader = UpstageDocumentParseParser(split="page", output_format="text")
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        base_url: str = DOCUMENT_PARSE_BASE_URL,
        model: str = DOCUMENT_PARSE_DEFAULT_MODEL,
        split: SplitType = "none",
        ocr: OCR = "auto",
        output_format: OutputFormat = "markdown",
        coordinates: bool = True,
        base64_encoding: Optional[list[Category]] = None,
        chatterer: Optional[Chatterer] = None,
        image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION,
    ) -> None:
        """
        Initializes the parser.

        Args:
            api_key (str, optional): The API key for accessing the Upstage API.
                                     Defaults to None, in which case it will be
                                     fetched from the environment variable
                                     `UPSTAGE_API_KEY`.
            base_url (str, optional): The base URL for accessing the Upstage API.
            model (str): The model to be used for the document parse.
                         Defaults to "document-parse".
            split (SplitType, optional): The type of splitting to be applied.
                                         Defaults to "none" (no splitting).
            ocr (OCR, optional): Extract text from images in the document using OCR.
                                 If the value is "force", OCR is used to extract
                                 text from an image. If the value is "auto", text is
                                 extracted from a PDF. (An error will occur if the
                                 value is "auto" and the input is NOT in PDF format)
            output_format (OutputFormat, optional): Format of the inference results.
            coordinates (bool, optional): Whether to include the coordinates of the
                                          OCR in the output.
            base64_encoding (List[Category], optional): The category of the elements to
                                                        be encoded in base64. Defaults
                                                        to None, meaning no category.
            chatterer (Chatterer, optional): The Chatterer instance to use for image
                                             description.
            image_description_instruction (str, optional): The instruction to use for
                                                           image description.

        Raises:
            ValueError: If no API key is passed and none is found in the environment.
        """
        self.api_key = get_from_param_or_env(
            "UPSTAGE_API_KEY",
            api_key,
            "UPSTAGE_API_KEY",
            os.environ.get("UPSTAGE_API_KEY"),
        )
        self.base_url = base_url
        self.model = model
        self.split: SplitType = split
        self.ocr: OCR = ocr
        self.output_format: OutputFormat = output_format
        self.coordinates = coordinates
        # Always store a private copy: a shared default list (or the caller's
        # own list) must not be aliased across parser instances.
        self.base64_encoding: list[Category] = list(base64_encoding) if base64_encoding is not None else []
        self.chatterer = chatterer
        self.image_description_instruction = image_description_instruction

    def _get_response(self, files: dict[str, BytesReadable]) -> list[Element]:
        """
        Sends a POST request to the API endpoint with the provided files and
        returns the validated elements of the response.

        Args:
            files (dict): A dictionary containing the files to be sent in the request.

        Returns:
            list[Element]: The entries of the response's "elements" field
                (an empty list when the field is missing).

        Raises:
            ValueError: If there is an error in the API call or its response.
                The original exception is attached as ``__cause__``.
        """
        try:
            headers = {
                "Authorization": f"Bearer {self.api_key}",
            }
            response = requests.post(
                self.base_url,
                headers=headers,
                files=files,
                data={
                    "ocr": self.ocr,
                    "model": self.model,
                    "output_formats": f"['{self.output_format}']",
                    "coordinates": self.coordinates,
                    "base64_encoding": f"{self.base64_encoding}",
                },
            )
            response.raise_for_status()
            result: object = response.json().get("elements", [])
            if not isinstance(result, list):
                raise ValueError(f"Failed to parse JSON data: {result}")
            result = cast(list[object], result)
            return [Element.model_validate(element) for element in result]
        except requests.HTTPError as e:
            raise ValueError(f"HTTP error: {e.response.text}") from e
        except requests.RequestException as e:
            # Handle any request-related exceptions
            raise ValueError(f"Failed to send request: {e}") from e
        except json.JSONDecodeError as e:
            # Handle JSON decode errors
            raise ValueError(f"Failed to decode JSON response: {e}") from e
        except Exception as e:
            # Handle any other exceptions (including the ValueError raised above)
            raise ValueError(f"An error occurred: {e}") from e

    def _split_and_request(
        self, full_docs: PdfReader, start_page: int, num_pages: int = DEFAULT_NUM_PAGES
    ) -> list[Element]:
        """
        Extracts a range of pages from the full pdf document and sends it to the
        server.

        Args:
            full_docs (PdfReader): The full document to be split and requested.
            start_page (int): The zero-based index of the first page to send.
            num_pages (int, optional): The maximum number of pages to send.
                                       Defaults to DEFAULT_NUM_PAGES.

        Returns:
            list[Element]: The elements parsed from the selected pages.
        """
        merger = PdfWriter()
        merger.append(
            full_docs,
            # Clamp the end of the range to the end of the document.
            pages=(start_page, min(start_page + num_pages, full_docs.get_num_pages())),
        )

        with io.BytesIO() as buffer:
            merger.write(buffer)
            buffer.seek(0)
            return self._get_response({"document": buffer})

    def _iter_pdf_batches(
        self, full_docs: PdfReader, number_of_pages: int, num_pages: int
    ) -> Iterator[tuple[int, list[Element]]]:
        """Lazily request the PDF in batches, yielding ``(start_page, elements)`` per batch."""
        for start_page in range(0, number_of_pages, num_pages):
            yield start_page, self._split_and_request(full_docs, start_page, num_pages)

    def _request_blob_file(self, blob: Blob) -> list[Element]:
        """Upload the file at ``blob.path`` as a whole; used when the blob is not a readable PDF."""
        if not blob.path:
            raise ValueError("Blob path is required for non-PDF files.")
        with open(blob.path, "rb") as f:
            return self._get_response({"document": f})

    def _element_document(self, element: Element, start_page: int = 0) -> Document:
        """
        Converts an element into a Document object.

        Args:
            element (Element): The element to convert.
            start_page (int): The starting page number for splitting the document.
                              This number starts from zero.

        Returns:
            Document: The element's text with the element's fields as metadata;
                the "page" entry is offset by ``start_page``.
        """
        metadata: dict[str, object] = element.model_dump(exclude_none=True)
        metadata["page"] = element.page + start_page
        return Document(
            page_content=element.parse_text(self),
            metadata=metadata,
        )

    def _page_document(self, elements: list[Element], start_page: int = 0) -> list[Document]:
        """
        Combines elements with the same page number into a single Document object.

        Args:
            elements (List): A list of elements containing page numbers.
            start_page (int): The starting page number for splitting the document.
                              This number starts from zero.

        Returns:
            List[Document]: A list of Document objects, each representing a page
                            with its content and metadata, ordered by page number.
        """
        # Group in a single pass; elements keep their relative order within a page.
        elements_by_page: dict[int, list[Element]] = {}
        for element in elements:
            elements_by_page.setdefault(element.page, []).append(element)

        documents: list[Document] = []
        for page in sorted(elements_by_page):
            group = elements_by_page[page]
            metadata: dict[str, object] = {
                "page": page + start_page,
            }
            if self.base64_encoding:
                metadata["base64_encodings"] = [element.base64_encoding for element in group if element.base64_encoding]
            if self.coordinates:
                metadata["coordinates"] = [element.coordinates for element in group if element.coordinates]
            documents.append(
                Document(
                    page_content=" ".join(element.parse_text(self) for element in group),
                    metadata=metadata,
                )
            )

        return documents

    def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
        """
        Lazily parses a document and yields Document objects based on the specified
        split type.

        The file at ``blob.path`` is first opened as a PDF. A readable PDF is sent
        to the API in page batches; if it cannot be read as a PDF it is uploaded
        as a single file.

        Args:
            blob (Blob): The input document blob to parse.
            is_batch (bool, optional): Whether to send DEFAULT_NUM_PAGES pages per
                                       request instead of one page per request.
                                       Defaults to False. With ``split="none"``
                                       PDFs are always sent in full batches.

        Yields:
            Document: The parsed document object.

        Raises:
            ValueError: If the file cannot be opened, if a non-PDF blob has no path,
                        if a request fails, or if an invalid split type is provided.

        """
        num_pages = DEFAULT_NUM_PAGES if is_batch else 1

        full_docs: Optional[PdfReader] = None
        try:
            full_docs = PdfReader(str(blob.path))
            number_of_pages = full_docs.get_num_pages()
        except PdfReadError:
            # Not a readable PDF: treat the input as a one-page document.
            number_of_pages = 1
        except Exception as e:
            raise ValueError(f"Failed to read PDF file: {e}") from e

        if self.split == "none":
            text_parts: list[str] = []
            base64_encodings: list[str] = []
            coordinates: list[list[Coordinate]] = []

            if full_docs is not None:
                # Requests stay lazy: the next batch is only fetched once the
                # elements of the previous one have been consumed.
                batches = self._iter_pdf_batches(full_docs, number_of_pages, DEFAULT_NUM_PAGES)
                element_stream: Iterator[Element] = (element for _, batch in batches for element in batch)
            else:
                element_stream = iter(self._request_blob_file(blob))

            for element in element_stream:
                text_parts.append(element.parse_text(self))
                if self.base64_encoding and (base64_encoding := element.base64_encoding):
                    base64_encodings.append(base64_encoding)
                if self.coordinates and (coords := element.coordinates):
                    coordinates.append(coords)

            metadata: dict[str, object] = {"total_pages": number_of_pages}
            if self.coordinates:
                metadata["coordinates"] = coordinates
            if self.base64_encoding:
                metadata["base64_encodings"] = base64_encodings

            yield Document(
                page_content="".join(text_parts),
                metadata=metadata,
            )

        elif self.split == "element":
            if full_docs is not None:
                for start_page, elements in self._iter_pdf_batches(full_docs, number_of_pages, num_pages):
                    for element in elements:
                        yield self._element_document(element, start_page)
            else:
                for element in self._request_blob_file(blob):
                    yield self._element_document(element)

        elif self.split == "page":
            if full_docs is not None:
                for start_page, elements in self._iter_pdf_batches(full_docs, number_of_pages, num_pages):
                    yield from self._page_document(elements, start_page)
            else:
                yield from self._page_document(self._request_blob_file(blob))

        else:
            raise ValueError(f"Invalid split type: {self.split}")
@@ -277,6 +277,7 @@ def get_image_url_and_markdown_links(
277
277
 
278
278
  image_data = Base64Image.from_url_or_path(markdown_link.url, headers=headers, config=config)
279
279
  if not image_data:
280
+ image_matches.setdefault(None, []).append(markdown_link)
280
281
  continue
281
282
  image_matches.setdefault(image_data, []).append(markdown_link)
282
283
  return image_matches
@@ -294,6 +295,7 @@ async def aget_image_url_and_markdown_links(
294
295
  markdown_link.url, headers=headers, config=config, return_coro=True
295
296
  )
296
297
  if not image_data:
298
+ image_matches.setdefault(None, []).append(markdown_link)
297
299
  continue
298
300
  image_matches.setdefault(image_data, []).append(markdown_link)
299
301
  return image_matches
@@ -306,7 +308,10 @@ def replace_images(
306
308
  for image_description, markdown_links in image_description_and_references.items():
307
309
  for markdown_link in markdown_links:
308
310
  if image_description is None:
309
- replacements.append((markdown_link, markdown_link.link_markdown))
311
+ if markdown_link.type == "link":
312
+ replacements.append((markdown_link, markdown_link.link_markdown))
313
+ elif markdown_link.type == "image":
314
+ replacements.append((markdown_link, f"![{markdown_link.inline_text}](...)"))
310
315
  else:
311
316
  replacements.append((
312
317
  markdown_link,
@@ -0,0 +1,59 @@
1
+ import os
2
+ from contextlib import contextmanager, suppress
3
+ from io import BytesIO
4
+ from typing import Iterator, Optional
5
+
6
+ from ..common_types.io import BytesReadable, PathOrReadable, StringReadable
7
+
8
+
9
@contextmanager
def read_bytes_stream(
    path_or_file: PathOrReadable,
    assume_pathlike_bytes_as_path: bool = False,
    assume_pathlike_string_as_path: bool = True,
) -> Iterator[Optional[BytesReadable]]:
    """
    Context manager for opening a file or using an existing stream.

    Handles different types of input (file paths, byte streams, string streams)
    and yields a BytesReadable object that can be used to read binary data.

    Args:
        path_or_file: File path or readable object.
        assume_pathlike_bytes_as_path: If True, a ``bytes`` value naming an existing
            path is opened as a file. Otherwise (or if no such path exists) the
            bytes are treated as the data itself.
        assume_pathlike_string_as_path: If True, a ``str`` value naming an existing
            path is opened as a file. Otherwise (or if no such path exists) the
            string is UTF-8 encoded and treated as the data itself.

    Yields:
        Optional[BytesReadable]: A readable binary stream or None if opening fails.

    Note:
        A byte stream passed in by the caller is yielded as-is and is NOT closed on
        exit; every stream created here is closed on exit. A text stream passed in
        is read to obtain its contents.
    """
    stream: Optional[BytesReadable] = None
    should_close: bool = True  # Whether the stream should be closed after use
    try:
        # Best effort: any ordinary failure results in ``None`` being yielded.
        # Only ``Exception`` is suppressed so that KeyboardInterrupt and
        # SystemExit still propagate instead of being silently discarded.
        with suppress(Exception):
            if isinstance(path_or_file, BytesReadable):
                # Assume the input is already a bytes stream
                # NOTE: Delivers itself, so shouldn't be closed.
                stream = path_or_file
                should_close = False
            elif isinstance(path_or_file, StringReadable):
                # Convert the string stream to bytes stream
                stream = BytesIO(path_or_file.read().encode("utf-8"))
            elif isinstance(path_or_file, bytes):
                # Either a path given as bytes or the raw data itself
                if assume_pathlike_bytes_as_path and os.path.exists(path_or_file):
                    stream = open(path_or_file, "rb")
                else:
                    stream = BytesIO(path_or_file)
            elif isinstance(path_or_file, str):
                # Either a path given as text or the text data itself
                if assume_pathlike_string_as_path and os.path.exists(path_or_file):
                    stream = open(path_or_file, "rb")
                else:
                    stream = BytesIO(path_or_file.encode("utf-8"))
            else:
                # Assume the input is a file descriptor or path
                stream = open(path_or_file, "rb")
        yield stream
    finally:
        if stream is not None and should_close:
            stream.close()
chatterer/utils/image.py CHANGED
@@ -3,8 +3,8 @@ from __future__ import annotations
3
3
  import re
4
4
  from base64 import b64encode
5
5
  from io import BytesIO
6
+ from logging import getLogger
6
7
  from pathlib import Path
7
- from traceback import print_exc
8
8
  from typing import (
9
9
  Awaitable,
10
10
  ClassVar,
@@ -28,6 +28,7 @@ from PIL.Image import Resampling
28
28
  from PIL.Image import open as image_open
29
29
  from pydantic import BaseModel
30
30
 
31
+ logger = getLogger(__name__)
31
32
  ImageType: TypeAlias = Literal["jpeg", "jpg", "png", "gif", "webp", "bmp"]
32
33
 
33
34
 
@@ -123,7 +124,10 @@ class Base64Image(BaseModel):
123
124
  if return_coro:
124
125
  return cls._afetch_remote_image(url_or_path, headers, config)
125
126
  return cls._fetch_remote_image(url_or_path, headers, config)
126
- return cls._process_local_image(Path(url_or_path), config)
127
+ try:
128
+ return cls._process_local_image(Path(url_or_path), config)
129
+ except Exception:
130
+ return None
127
131
 
128
132
  @property
129
133
  def data_uri(self) -> str:
@@ -167,7 +171,7 @@ class Base64Image(BaseModel):
167
171
  max_size_mb = config.get("max_size_mb", float("inf"))
168
172
  image_size_mb = len(image_data) / (1024 * 1024)
169
173
  if image_size_mb > max_size_mb:
170
- print(f"Image too large: {image_size_mb:.2f} MB > {max_size_mb} MB")
174
+ logger.error(f"Image too large: {image_size_mb:.2f} MB > {max_size_mb} MB")
171
175
  return None
172
176
 
173
177
  # 2) Pillow로 이미지 열기
@@ -182,7 +186,7 @@ class Base64Image(BaseModel):
182
186
  # min_largest_side 기준
183
187
  min_largest_side = config.get("min_largest_side", 1)
184
188
  if largest_side < min_largest_side:
185
- print(f"Image too small: {largest_side} < {min_largest_side}")
189
+ logger.error(f"Image too small: {largest_side} < {min_largest_side}")
186
190
  return None
187
191
 
188
192
  # resize 로직
@@ -200,7 +204,7 @@ class Base64Image(BaseModel):
200
204
  pil_format: str = (im.format or "").lower()
201
205
  allowed_formats: Sequence[ImageType] = config.get("formats", [])
202
206
  if not cls._verify_ext(pil_format, allowed_formats):
203
- print(f"Invalid format: {pil_format} not in {allowed_formats}")
207
+ logger.error(f"Invalid format: {pil_format} not in {allowed_formats}")
204
208
  return None
205
209
 
206
210
  # 다시 bytes 로 저장
@@ -210,7 +214,6 @@ class Base64Image(BaseModel):
210
214
  final_bytes = output_buffer.read()
211
215
 
212
216
  except Exception:
213
- print_exc()
214
217
  return None
215
218
 
216
219
  # 최종 base64 인코딩
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chatterer
3
- Version: 0.1.11
3
+ Version: 0.1.13
4
4
  Summary: The highest-level interface for various LLM APIs.
5
5
  Requires-Python: >=3.12
6
6
  Description-Content-Type: text/markdown
@@ -15,18 +15,19 @@ Requires-Dist: markdownify>=1.1.0; extra == "conversion"
15
15
  Requires-Dist: commonmark>=0.9.1; extra == "conversion"
16
16
  Requires-Dist: playwright>=1.50.0; extra == "conversion"
17
17
  Requires-Dist: pillow>=11.1.0; extra == "conversion"
18
- Requires-Dist: mistune>=3.1.2; extra == "conversion"
19
- Requires-Dist: markitdown>=0.0.2; extra == "conversion"
18
+ Requires-Dist: mistune>=3.1.3; extra == "conversion"
19
+ Requires-Dist: markitdown>=0.1.1; extra == "conversion"
20
20
  Requires-Dist: pymupdf>=1.25.4; extra == "conversion"
21
- Requires-Dist: youtube-transcript-api>=1.0.2; extra == "conversion"
21
+ Requires-Dist: youtube-transcript-api>=1.0.3; extra == "conversion"
22
+ Requires-Dist: pypdf>=5.4.0; extra == "conversion"
22
23
  Provides-Extra: langchain
23
24
  Requires-Dist: chatterer[langchain-providers]; extra == "langchain"
24
25
  Requires-Dist: langchain-experimental>=0.3.4; extra == "langchain"
25
26
  Provides-Extra: langchain-providers
26
- Requires-Dist: langchain-openai>=0.3.7; extra == "langchain-providers"
27
- Requires-Dist: langchain-anthropic>=0.3.8; extra == "langchain-providers"
28
- Requires-Dist: langchain-google-genai>=2.0.10; extra == "langchain-providers"
29
- Requires-Dist: langchain-ollama>=0.2.3; extra == "langchain-providers"
27
+ Requires-Dist: langchain-openai>=0.3.11; extra == "langchain-providers"
28
+ Requires-Dist: langchain-anthropic>=0.3.10; extra == "langchain-providers"
29
+ Requires-Dist: langchain-google-genai>=2.1.1; extra == "langchain-providers"
30
+ Requires-Dist: langchain-ollama>=0.3.0; extra == "langchain-providers"
30
31
  Provides-Extra: all
31
32
  Requires-Dist: chatterer[langchain]; extra == "all"
32
33
  Requires-Dist: chatterer[conversion]; extra == "all"
@@ -1,12 +1,15 @@
1
- chatterer/__init__.py,sha256=BPgCQ6VWGBXSh8xJr_0bpM0hcOOUz0KoxcKxOd9GYyI,1388
2
- chatterer/language_model.py,sha256=DX_mU855JHHqE0gdnieWZNOwX1BjIO4VK4EightRL3w,24353
1
+ chatterer/__init__.py,sha256=444C_hySiaJNBsG40l6d_xYY_KT5rBiQLR1mgzOc19A,1460
2
+ chatterer/language_model.py,sha256=gjZC8SyTNZ0rQke_SongcfQid26coLtE7lguhdoFuX8,24078
3
3
  chatterer/messages.py,sha256=OtbZ3two0LUQ4PXES97FDIBUSO3IcMHdFV1VFkDL2mI,229
4
4
  chatterer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ chatterer/common_types/__init__.py,sha256=jfS6m5UANSvGjzQ_nzYDpryn5uZqNb06-4xCsQ2C_lw,376
6
+ chatterer/common_types/io.py,sha256=fetiyi1suZ3NF2mj5k5KDLJLGKS1n4J-5UmH7JN36g8,817
5
7
  chatterer/strategies/__init__.py,sha256=SdOggbmHpw4f7Njwy-T8q64e91OLOUp1k0a0ozZd4qI,221
6
8
  chatterer/strategies/atom_of_thoughts.py,sha256=CygOCLu5vLk-fzY9O-iE3qLShfjD7iY40ks9jH4ULBM,40872
7
9
  chatterer/strategies/base.py,sha256=b2gMPqodp97OP1dkHfj0UqixjdjVhmTw_V5qJ7i2S6g,427
8
- chatterer/tools/__init__.py,sha256=hmWIuLJWotGQodL__i4LLbHdXe7Nl5uKHqNke9tHMro,705
9
- chatterer/tools/convert_to_text.py,sha256=kBqxCJ0IoiAw2eiPYqep_SPZm-TtYKF7mdACLsWQUuI,15915
10
+ chatterer/tools/__init__.py,sha256=CK6hHDmgqHg70k6hHcMdHv5qutKBfReaNy2c4EaKOns,864
11
+ chatterer/tools/convert_to_text.py,sha256=gfeMDogvDg8G4ZcRC3m4yU24_0-r_cl5gXHwg2Ym9p4,14222
12
+ chatterer/tools/upstage_document_parser.py,sha256=s0mtukC93y7zwS94gjyvgcvCsr2fAUzt1LZPWYxdF1Q,17165
10
13
  chatterer/tools/youtube.py,sha256=GhyE05JBF_eos01A_N-X5tZv4wQJ--IjErBbEBeNBpQ,6037
11
14
  chatterer/tools/citation_chunking/__init__.py,sha256=gG7Fnkkp28UpcWMbfMY_4gqzZSZ8QzlhalHBoeoq7K0,82
12
15
  chatterer/tools/citation_chunking/chunks.py,sha256=50Dpa43RaYftlNox8tM1qI8htZ3_AJ9Uyyn02WsmxYk,2173
@@ -17,11 +20,12 @@ chatterer/tools/citation_chunking/reference.py,sha256=uRKufkU41Zedz6MQUCy-aCk4Rw
17
20
  chatterer/tools/citation_chunking/utils.py,sha256=M4pH2-UIE1VLzQLXDqjEe4L3Xcy0e0KhAP3I2U2BNms,6348
18
21
  chatterer/tools/webpage_to_markdown/__init__.py,sha256=bHH4qfnXyw8Zz-yBPLaTezF1sh9njvNBJmhBVtcpjsA,123
19
22
  chatterer/tools/webpage_to_markdown/playwright_bot.py,sha256=yP0KixYZNQ4Kn_ZCFDI3mVyBD_DpUGfqgklpaGJUTCU,27496
20
- chatterer/tools/webpage_to_markdown/utils.py,sha256=ZLUU94imYciEdynD2K7Dmcsbt8BVQTaOP56Ba6DAFvk,12593
23
+ chatterer/tools/webpage_to_markdown/utils.py,sha256=TK88-ReOUTs8njIGDY-nCNNVCPwHCVb6nV5wNuDxx2Q,12938
21
24
  chatterer/utils/__init__.py,sha256=8nzpFJKU_wSRPH6LBP6HRBotPMrSl_VO9UlmFprTrK0,334
25
+ chatterer/utils/bytesio.py,sha256=3MC2atOOFKo5YxuReo_y_t8Wem9p2Y1ahC5M2lGclwI,2618
22
26
  chatterer/utils/code_agent.py,sha256=UaWdeGzJMPzRSFy9yrxuveBJsvOPSa0te6OuE18bees,5143
23
- chatterer/utils/image.py,sha256=1imiyq6TB9NIIGx3zAA2OwMWuXlifYIAjwfWRWa4WIM,10858
24
- chatterer-0.1.11.dist-info/METADATA,sha256=S3hRkxG1DlFc_NGrra1xhniiCDDVoVrow2N96OJy8i0,4458
25
- chatterer-0.1.11.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
26
- chatterer-0.1.11.dist-info/top_level.txt,sha256=7nSQKP0bHxPRc7HyzdbKsJdkvPgYD0214o6slRizv9s,10
27
- chatterer-0.1.11.dist-info/RECORD,,
27
+ chatterer/utils/image.py,sha256=mBqVBAhIpe1PovxKMPJ77GHcUsePQlgIWW2FZgh-6Z4,10952
28
+ chatterer-0.1.13.dist-info/METADATA,sha256=AAhg295_57oJYop5P0cBHQiiP1ArzP_Zo-8Lq7APzYY,4511
29
+ chatterer-0.1.13.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
30
+ chatterer-0.1.13.dist-info/top_level.txt,sha256=7nSQKP0bHxPRc7HyzdbKsJdkvPgYD0214o6slRizv9s,10
31
+ chatterer-0.1.13.dist-info/RECORD,,