chatterer-0.1.11.tar.gz → chatterer-0.1.13.tar.gz
This diff shows the content changes between publicly released versions of the package. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- {chatterer-0.1.11 → chatterer-0.1.13}/PKG-INFO +9 -8
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/__init__.py +3 -1
- chatterer-0.1.13/chatterer/common_types/__init__.py +21 -0
- chatterer-0.1.13/chatterer/common_types/io.py +19 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/language_model.py +65 -69
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/__init__.py +8 -1
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/convert_to_text.py +4 -49
- chatterer-0.1.13/chatterer/tools/upstage_document_parser.py +438 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/webpage_to_markdown/utils.py +6 -1
- chatterer-0.1.13/chatterer/utils/bytesio.py +59 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/utils/image.py +9 -6
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer.egg-info/PKG-INFO +9 -8
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer.egg-info/SOURCES.txt +4 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer.egg-info/requires.txt +8 -7
- {chatterer-0.1.11 → chatterer-0.1.13}/pyproject.toml +9 -8
- {chatterer-0.1.11 → chatterer-0.1.13}/README.md +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/messages.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/py.typed +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/strategies/__init__.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/strategies/atom_of_thoughts.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/strategies/base.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/citation_chunking/__init__.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/citation_chunking/chunks.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/citation_chunking/citation_chunker.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/citation_chunking/citations.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/citation_chunking/prompt.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/citation_chunking/reference.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/citation_chunking/utils.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/webpage_to_markdown/__init__.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/webpage_to_markdown/playwright_bot.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/youtube.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/utils/__init__.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer/utils/code_agent.py +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer.egg-info/dependency_links.txt +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/chatterer.egg-info/top_level.txt +0 -0
- {chatterer-0.1.11 → chatterer-0.1.13}/setup.cfg +0 -0
{chatterer-0.1.11 → chatterer-0.1.13}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chatterer
-Version: 0.1.11
+Version: 0.1.13
 Summary: The highest-level interface for various LLM APIs.
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
@@ -15,18 +15,19 @@ Requires-Dist: markdownify>=1.1.0; extra == "conversion"
 Requires-Dist: commonmark>=0.9.1; extra == "conversion"
 Requires-Dist: playwright>=1.50.0; extra == "conversion"
 Requires-Dist: pillow>=11.1.0; extra == "conversion"
-Requires-Dist: mistune>=3.1.
-Requires-Dist: markitdown>=0.
+Requires-Dist: mistune>=3.1.3; extra == "conversion"
+Requires-Dist: markitdown>=0.1.1; extra == "conversion"
 Requires-Dist: pymupdf>=1.25.4; extra == "conversion"
-Requires-Dist: youtube-transcript-api>=1.0.
+Requires-Dist: youtube-transcript-api>=1.0.3; extra == "conversion"
+Requires-Dist: pypdf>=5.4.0; extra == "conversion"
 Provides-Extra: langchain
 Requires-Dist: chatterer[langchain-providers]; extra == "langchain"
 Requires-Dist: langchain-experimental>=0.3.4; extra == "langchain"
 Provides-Extra: langchain-providers
-Requires-Dist: langchain-openai>=0.3.
-Requires-Dist: langchain-anthropic>=0.3.
-Requires-Dist: langchain-google-genai>=2.
-Requires-Dist: langchain-ollama>=0.
+Requires-Dist: langchain-openai>=0.3.11; extra == "langchain-providers"
+Requires-Dist: langchain-anthropic>=0.3.10; extra == "langchain-providers"
+Requires-Dist: langchain-google-genai>=2.1.1; extra == "langchain-providers"
+Requires-Dist: langchain-ollama>=0.3.0; extra == "langchain-providers"
 Provides-Extra: all
 Requires-Dist: chatterer[langchain]; extra == "all"
 Requires-Dist: chatterer[conversion]; extra == "all"
```

(Removed lines that end abruptly, such as `mistune>=3.1.`, were truncated in the extracted page; the original lower bounds of the old constraints are not recoverable.)
{chatterer-0.1.11 → chatterer-0.1.13}/chatterer/__init__.py

```diff
@@ -16,12 +16,13 @@ from .tools import (
     anything_to_markdown,
     citation_chunker,
     get_default_html_to_markdown_options,
+    get_youtube_video_details,
     get_youtube_video_subtitle,
     html_to_markdown,
+    init_upstage_document_parser,
     init_webpage_to_markdown,
     pdf_to_text,
     pyscripts_to_snippets,
-    get_youtube_video_details,
 )
 from .utils import (
     Base64Image,
@@ -57,4 +58,5 @@ __all__ = [
     "get_youtube_video_subtitle",
    "get_youtube_video_details",
     "interactive_shell",
+    "init_upstage_document_parser",
 ]
```
chatterer-0.1.13/chatterer/common_types/__init__.py (new file)

```diff
@@ -0,0 +1,21 @@
+from .io import (
+    BytesReadable,
+    BytesWritable,
+    FileDescriptorOrPath,
+    PathOrReadable,
+    Readable,
+    StringReadable,
+    StringWritable,
+    Writable,
+)
+
+__all__ = [
+    "BytesReadable",
+    "BytesWritable",
+    "FileDescriptorOrPath",
+    "PathOrReadable",
+    "Readable",
+    "StringReadable",
+    "StringWritable",
+    "Writable",
+]
```
chatterer-0.1.13/chatterer/common_types/io.py (new file)

```diff
@@ -0,0 +1,19 @@
+import os
+from io import BufferedReader, BufferedWriter, BytesIO, StringIO, TextIOWrapper
+from typing import TypeAlias
+
+# Type aliases for callback functions and file descriptors
+FileDescriptorOrPath: TypeAlias = int | str | bytes | os.PathLike[str] | os.PathLike[bytes]
+
+# Type aliases for different types of IO objects
+BytesReadable: TypeAlias = BytesIO | BufferedReader
+BytesWritable: TypeAlias = BytesIO | BufferedWriter
+StringReadable: TypeAlias = StringIO | TextIOWrapper
+StringWritable: TypeAlias = StringIO | TextIOWrapper
+
+# Combined type aliases for readable and writable objects
+Readable: TypeAlias = BytesReadable | StringReadable
+Writable: TypeAlias = BytesWritable | StringWritable
+
+# Type alias for path or readable object
+PathOrReadable: TypeAlias = FileDescriptorOrPath | Readable
```
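Because union types support `isinstance` checks since Python 3.10, these aliases serve both as annotations and as runtime guards, which is how the new `read_bytes_stream` helper later in this diff uses them. A minimal sketch; `read_all_bytes` is an illustrative name, not part of the package:

```python
from io import BytesIO

from chatterer.common_types import BytesReadable, PathOrReadable, StringReadable


def read_all_bytes(source: PathOrReadable) -> bytes:
    # Illustrative helper: the union aliases double as isinstance() guards.
    if isinstance(source, BytesReadable):   # BytesIO | BufferedReader
        return source.read()
    if isinstance(source, StringReadable):  # StringIO | TextIOWrapper
        return source.read().encode("utf-8")
    # Remaining case is FileDescriptorOrPath (int, str, bytes, PathLike)
    with open(source, "rb") as f:
        return f.read()


print(read_all_bytes(BytesIO(b"hello")))  # b'hello'
```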
{chatterer-0.1.11 → chatterer-0.1.13}/chatterer/language_model.py

The streaming and code-review logic is factored into two local helpers, `respond` and `code_session_returning_end_of_turn`. Korean comments are translated below; two removed lines whose content was truncated in the extracted page (`code_result: ... = chatterer.invoke_code_execution(` and the initial `console.print("[bold blue]AI:[/bold blue] ", end="")`) are restored from the surrounding code.

```diff
@@ -465,6 +465,68 @@ def interactive_shell(
             description="Whether further Python tool calling is needed to answer user query."
         )
 
+    def respond(messages: list[BaseMessage]) -> str:
+        # Stream the AI response to the console
+        console.print("[bold blue]AI:[/bold blue] ", end="")
+        response = ""
+        for chunk in chatterer.generate_stream(messages=messages):
+            response += chunk
+            console.print(chunk, end="")
+        console.print()  # add a newline after the response
+        return response.strip()
+
+    def code_session_returning_end_of_turn() -> bool:
+        code_session_messages: list[BaseMessage] = []
+        while True:
+            code_execution: CodeExecutionResult = chatterer.invoke_code_execution(
+                messages=context,
+                repl_tool=repl_tool,
+                prompt_for_code_invoke=prompt_for_code_invoke,
+                function_signatures=function_signatures,
+                function_reference_prefix=function_reference_prefix,
+                function_reference_seperator=function_reference_seperator,
+                config=config,
+                stop=stop,
+                **kwargs,
+            )
+            if code_execution.code.strip() in ("", "quit", "exit", "pass"):
+                return False
+
+            last_tool_use_message = AIMessage(
+                content=f"Executed code:\n```python\n{code_execution.code}\n```\nOutput:\n{code_execution.output}".strip()
+            )
+            code_session_messages.append(last_tool_use_message)
+            console.print("[bold yellow]Executed code:[/bold yellow]")
+            console.print(f"[code]{code_execution.code}[/code]")
+            console.print("[bold yellow]Output:[/bold yellow]")
+            console.print(code_execution.output)
+
+            decision = chatterer.generate_pydantic(
+                response_model=IsFurtherCodeExecutionNeeded,
+                messages=augment_prompt_for_toolcall(
+                    function_signatures=function_signatures,
+                    messages=context + code_session_messages,
+                    prompt_for_code_invoke=prompt_for_code_invoke,
+                    function_reference_prefix=function_reference_prefix,
+                    function_reference_seperator=function_reference_seperator,
+                ),
+            )
+            review_on_code_execution = decision.review_on_code_execution.strip()
+            next_action = decision.next_action.strip()
+            console.print("[bold blue]AI:[/bold blue]")
+            console.print(f"-[bold yellow]Review on code execution:[/bold yellow] {review_on_code_execution}")
+            console.print(f"-[bold yellow]Next Action:[/bold yellow] {next_action}")
+            code_session_messages.append(
+                AIMessage(
+                    content=f"- Review upon code execution: {review_on_code_execution}\n- Next Action: {next_action}".strip()
+                )
+            )
+            if not decision.is_further_code_execution_needed:
+                response: str = respond(context + code_session_messages)
+                context.append(last_tool_use_message)
+                context.append(AIMessage(content=response))
+                return True
+
     # Initialize the REPL tool
     if repl_tool is None:
         repl_tool = get_default_repl_tool()
@@ -504,77 +566,11 @@ def interactive_shell(
         )
 
         # Handle code execution
-        if decision.is_code_execution_needed:
-            code_result: CodeExecutionResult = chatterer.invoke_code_execution(
-                messages=context,
-                repl_tool=repl_tool,
-                prompt_for_code_invoke=prompt_for_code_invoke,
-                function_signatures=function_signatures,
-                function_reference_prefix=function_reference_prefix,
-                function_reference_seperator=function_reference_seperator,
-                config=config,
-                stop=stop,
-                **kwargs,
-            )
-
-            if code_result.code.strip() == "pass":
-                tool_use_message = None
-            else:
-                code_session_messages: list[BaseMessage] = []
-                while True:
-                    code_execution_message = AIMessage(
-                        content=f"Executed code:\n```python\n{code_result.code}\n```\nOutput:\n{code_result.output}".strip()
-                    )
-                    code_session_messages.append(code_execution_message)
-                    console.print("[bold yellow]Executed code:[/bold yellow]")
-                    console.print(f"[code]{code_result.code}[/code]")
-                    console.print("[bold yellow]Output:[/bold yellow]")
-                    console.print(code_result.output)
-
-                    decision = chatterer.generate_pydantic(
-                        response_model=IsFurtherCodeExecutionNeeded,
-                        messages=augment_prompt_for_toolcall(
-                            function_signatures=function_signatures,
-                            messages=context + code_session_messages,
-                            prompt_for_code_invoke=prompt_for_code_invoke,
-                            function_reference_prefix=function_reference_prefix,
-                            function_reference_seperator=function_reference_seperator,
-                        ),
-                    )
-                    review_on_code_execution = decision.review_on_code_execution.strip()
-                    next_action = decision.next_action.strip()
-                    console.print("[bold blue]AI:[/bold blue]")
-                    console.print(f"-[bold yellow]Review on code execution:[/bold yellow] {review_on_code_execution}")
-                    console.print(f"-[bold yellow]Next Action:[/bold yellow] {next_action}")
-                    code_session_messages.append(
-                        AIMessage(
-                            content=f"- Review upon code execution: {review_on_code_execution}\n- Next Action: {next_action}".strip()
-                        )
-                    )
-                    if not decision.is_further_code_execution_needed:
-                        tool_use_message = code_execution_message
-                        break
-        else:
-            tool_use_message = None
-
-        # Add the code execution result to the context
-        if tool_use_message:
-            context.append(tool_use_message)
+        if decision.is_code_execution_needed and code_session_returning_end_of_turn():
+            continue
 
         # Stream the AI response
-        console.print("[bold blue]AI:[/bold blue] ", end="")
-        response = ""
-        for chunk in chatterer.generate_stream(messages=context):
-            response += chunk
-            console.print(chunk, end="")
-
-        # Post-process the full response and append it to the context
-        lines = response.split("\n")
-        if lines:
-            lines[-1] = lines[-1].rstrip()  # strip trailing whitespace from the last line
-        response = "\n".join(lines).strip()
-        context.append(AIMessage(content=response))
-        console.print()  # add a newline after the response
+        context.append(AIMessage(content=respond(context)))
 
 
 if __name__ == "__main__":
```
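After the refactor, each turn of the interactive loop reduces to a single conditional: run a code session if the model asked for one, and only fall through to a plain reply if that session did not already answer. A distilled sketch of the control flow; the callables here are stand-ins, not the package API:

```python
from typing import Callable


def one_turn(
    needs_code: bool,
    code_session: Callable[[], bool],  # True means it already produced the final reply
    respond: Callable[[], str],
) -> None:
    # Mirrors: if decision.is_code_execution_needed and code_session_returning_end_of_turn(): continue
    if needs_code and code_session():
        return  # the code session streamed and recorded the reply itself
    respond()  # otherwise fall through to a plain streamed reply
```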
{chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/__init__.py

```diff
@@ -6,7 +6,7 @@ from .convert_to_text import (
     pdf_to_text,
     pyscripts_to_snippets,
 )
-from .youtube import
+from .youtube import get_youtube_video_details, get_youtube_video_subtitle
 
 
 def init_webpage_to_markdown():
@@ -15,6 +15,12 @@ def init_webpage_to_markdown():
     return webpage_to_markdown
 
 
+def init_upstage_document_parser():
+    from . import upstage_document_parser
+
+    return upstage_document_parser
+
+
 __all__ = [
     "html_to_markdown",
     "anything_to_markdown",
@@ -25,4 +31,5 @@ __all__ = [
     "init_webpage_to_markdown",
     "get_youtube_video_subtitle",
     "get_youtube_video_details",
+    "init_upstage_document_parser",
 ]
```

(The tail of the removed `from .youtube import` line was truncated in the extracted page.)
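`init_upstage_document_parser` mirrors `init_webpage_to_markdown`: it returns the submodule itself, deferring the import (and its optional dependencies such as pypdf and requests) until first use. The expected call pattern, assuming the `conversion` extra is installed:

```python
from chatterer.tools import init_upstage_document_parser

# Nothing Upstage-related is imported until this call.
upstage = init_upstage_document_parser()
parser = upstage.UpstageDocumentParseParser(output_format="markdown")
```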
{chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/convert_to_text.py

The IO type aliases and the private `_open_stream` context manager move out of this module into `chatterer.common_types.io` and `chatterer.utils.bytesio`. The truncated removed `with` line is restored as `with _open_stream(path_or_file) as stream:`, matching the removed helper's signature.

```diff
@@ -3,14 +3,11 @@ import importlib
 import os
 import re
 import site
-from contextlib import contextmanager, suppress
 from fnmatch import fnmatch
-from io import BufferedReader, BufferedWriter, BytesIO, StringIO, TextIOWrapper
 from pathlib import Path
 from typing import (
     TYPE_CHECKING,
     Callable,
-    Iterator,
     NamedTuple,
     NotRequired,
     Optional,
@@ -20,6 +17,9 @@ from typing import (
     TypedDict,
 )
 
+from ..common_types.io import PathOrReadable
+from ..utils.bytesio import read_bytes_stream
+
 if TYPE_CHECKING:
     from bs4 import Tag
     from openai import OpenAI
@@ -38,20 +38,6 @@ type FileTree = dict[str, Optional[FileTree]]
 
 # Type aliases for callback functions and file descriptors
 CodeLanguageCallback: TypeAlias = Callable[["Tag"], Optional[str]]
-FileDescriptorOrPath: TypeAlias = int | str | bytes | os.PathLike[str] | os.PathLike[bytes]
-
-# Type aliases for different types of IO objects
-BytesReadable: TypeAlias = BytesIO | BufferedReader
-BytesWritable: TypeAlias = BytesIO | BufferedWriter
-StringReadable: TypeAlias = StringIO | TextIOWrapper
-StringWritable: TypeAlias = StringIO | TextIOWrapper
-
-# Combined type aliases for readable and writable objects
-Readable: TypeAlias = BytesReadable | StringReadable
-Writable: TypeAlias = BytesWritable | StringWritable
-
-# Type alias for path or readable object
-PathOrReadable: TypeAlias = FileDescriptorOrPath | Readable
 
 
 class HtmlToMarkdownOptions(TypedDict):
@@ -240,7 +226,7 @@ def pdf_to_text(path_or_file: PathOrReadable) -> str:
     """
     from pymupdf import Document  # pyright: ignore[reportMissingTypeStubs]
 
-    with _open_stream(path_or_file) as stream:
+    with read_bytes_stream(path_or_file) as stream:
         if stream is None:
             raise FileNotFoundError(path_or_file)
         return "\n".join(
@@ -430,34 +416,3 @@ def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str
         if p.is_file()
     ]
     return [p for p in pypaths if not ban_fn_patterns or not _is_banned(p, ban_fn_patterns)]
-
-
-@contextmanager
-def _open_stream(
-    path_or_file: PathOrReadable,
-) -> Iterator[Optional[BytesReadable]]:
-    """
-    Context manager for opening a file or using an existing stream.
-
-    Handles different types of input (file paths, byte streams, string streams)
-    and yields a BytesReadable object that can be used to read binary data.
-
-    Args:
-        path_or_file: File path or readable object.
-
-    Yields:
-        Optional[BytesReadable]: A readable binary stream or None if opening fails.
-    """
-    stream: Optional[BytesReadable] = None
-    try:
-        with suppress(BaseException):
-            if isinstance(path_or_file, BytesReadable):
-                stream = path_or_file
-            elif isinstance(path_or_file, StringReadable):
-                stream = BytesIO(path_or_file.read().encode("utf-8"))
-            else:
-                stream = open(path_or_file, "rb")
-        yield stream
-    finally:
-        if stream is not None:
-            stream.close()
```
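With `_open_stream` replaced by the shared `read_bytes_stream` helper, `pdf_to_text` keeps accepting either a path or an already-open binary stream. A short sketch; the file name is illustrative:

```python
from io import BytesIO

from chatterer.tools import pdf_to_text

text = pdf_to_text("report.pdf")  # from a path on disk

with open("report.pdf", "rb") as f:  # from an in-memory stream
    text_again = pdf_to_text(BytesIO(f.read()))
```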
chatterer-0.1.13/chatterer/tools/upstage_document_parser.py (new file)

A few docstring typos are corrected below ("Adopted" → "Adapted", `OCRMode` → `OCR`, `DEFAULT_NUMBER_OF_PAGE` → `DEFAULT_NUM_PAGES`, and the `_element_document`/`_get_response` return descriptions); the code itself is as released.

```diff
@@ -0,0 +1,438 @@
+"""Adapted from `langchain_upstage.document_parse`."""
+
+import io
+import json
+import logging
+import os
+from typing import Iterator, Literal, Optional, cast
+
+import requests
+from langchain_core.document_loaders import BaseBlobParser, Blob
+from langchain_core.documents import Document
+from pydantic import BaseModel, Field
+from pypdf import PdfReader, PdfWriter
+from pypdf.errors import PdfReadError
+
+from ..common_types.io import BytesReadable
+from ..language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
+from ..utils.image import Base64Image
+
+logger = logging.getLogger("pypdf")
+logger.setLevel(logging.ERROR)
+
+DOCUMENT_PARSE_BASE_URL = "https://api.upstage.ai/v1/document-ai/document-parse"
+DEFAULT_NUM_PAGES = 10
+DOCUMENT_PARSE_DEFAULT_MODEL = "document-parse"
+
+OutputFormat = Literal["text", "html", "markdown"]
+OCR = Literal["auto", "force"]
+SplitType = Literal["none", "page", "element"]
+Category = Literal[
+    "paragraph",
+    "table",
+    "figure",
+    "header",
+    "footer",
+    "caption",
+    "equation",
+    "heading1",
+    "list",
+    "index",
+    "footnote",
+    "chart",
+]
+
+
+class Content(BaseModel):
+    text: Optional[str] = None
+    html: Optional[str] = None
+    markdown: Optional[str] = None
+
+
+class Coordinate(BaseModel):
+    x: float
+    y: float
+
+
+class Element(BaseModel):
+    category: Category
+    content: Content
+    coordinates: list[Coordinate] = Field(default_factory=list)
+    base64_encoding: str = ""
+    id: int
+    page: int
+
+    def parse_text(self, parser: "UpstageDocumentParseParser") -> str:
+        output_format: OutputFormat = parser.output_format
+        chatterer: Optional[Chatterer] = parser.chatterer
+        image_description_instruction: str = parser.image_description_instruction
+        output: Optional[str] = None
+        if output_format == "text":
+            output = self.content.text
+        elif output_format == "html":
+            output = self.content.html
+        elif output_format == "markdown":
+            output = self.content.markdown
+        if output is None:
+            raise ValueError(f"Invalid output format: {output_format}")
+
+        if chatterer is not None and self.category == "figure" and self.base64_encoding:
+            image = Base64Image.from_string(f"data:image/jpeg;base64,{self.base64_encoding}")
+            if image is None:
+                raise ValueError(f"Invalid base64 encoding for image: {self.base64_encoding}")
+            ocr_content = output.removeprefix("\n")
+            image_description = chatterer.describe_image(
+                image.data_uri,
+                image_description_instruction
+                + f"\nHint: The OCR detected the following text:\n```\n{ocr_content}\n```",
+            )
+            output = f"\n\n<details>\n{image_description}\n</details>\n\n"
+
+        return output
+
+
+def get_from_param_or_env(
+    key: str,
+    param: Optional[str] = None,
+    env_key: Optional[str] = None,
+    default: Optional[str] = None,
+) -> str:
+    """Get a value from a param or an environment variable."""
+    if param is not None:
+        return param
+    elif env_key and env_key in os.environ and os.environ[env_key]:
+        return os.environ[env_key]
+    elif default is not None:
+        return default
+    else:
+        raise ValueError(
+            f"Did not find {key}, please add an environment variable"
+            f" `{env_key}` which contains it, or pass"
+            f" `{key}` as a named parameter."
+        )
+
+
+class UpstageDocumentParseParser(BaseBlobParser):
+    """Upstage Document Parse Parser.
+
+    To use, you should have the environment variable `UPSTAGE_API_KEY`
+    set with your API key or pass it as a named parameter to the constructor.
+
+    Example:
+        .. code-block:: python
+
+            from langchain_upstage import UpstageDocumentParseParser
+
+            loader = UpstageDocumentParseParser(split="page", output_format="text")
+    """
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        base_url: str = DOCUMENT_PARSE_BASE_URL,
+        model: str = DOCUMENT_PARSE_DEFAULT_MODEL,
+        split: SplitType = "none",
+        ocr: OCR = "auto",
+        output_format: OutputFormat = "markdown",
+        coordinates: bool = True,
+        base64_encoding: list[Category] = [],
+        chatterer: Optional[Chatterer] = None,
+        image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION,
+    ) -> None:
+        """
+        Initializes an instance of the UpstageDocumentParseParser class.
+
+        Args:
+            api_key (str, optional): The API key for accessing the Upstage API.
+                                     Defaults to None, in which case it will be
+                                     fetched from the environment variable
+                                     `UPSTAGE_API_KEY`.
+            base_url (str, optional): The base URL for accessing the Upstage API.
+            model (str): The model to be used for the document parse.
+                         Defaults to "document-parse".
+            split (SplitType, optional): The type of splitting to be applied.
+                                         Defaults to "none" (no splitting).
+            ocr (OCR, optional): Extract text from images in the document using OCR.
+                                 If the value is "force", OCR is used to extract
+                                 text from an image. If the value is "auto", text is
+                                 extracted from a PDF. (An error will occur if the
+                                 value is "auto" and the input is NOT in PDF format)
+            output_format (OutputFormat, optional): Format of the inference results.
+            coordinates (bool, optional): Whether to include the coordinates of the
+                                          OCR in the output.
+            base64_encoding (list[Category], optional): The categories of the elements
+                                                        to be encoded in base64.
+            chatterer (Chatterer, optional): The Chatterer instance to use for image
+                                             description.
+            image_description_instruction (str, optional): The instruction to use for
+                                                           image description.
+        """
+        self.api_key = get_from_param_or_env(
+            "UPSTAGE_API_KEY",
+            api_key,
+            "UPSTAGE_API_KEY",
+            os.environ.get("UPSTAGE_API_KEY"),
+        )
+        self.base_url = base_url
+        self.model = model
+        self.split: SplitType = split
+        self.ocr: OCR = ocr
+        self.output_format: OutputFormat = output_format
+        self.coordinates = coordinates
+        self.base64_encoding: list[Category] = base64_encoding
+        self.chatterer = chatterer
+        self.image_description_instruction = image_description_instruction
+
+    def _get_response(self, files: dict[str, BytesReadable]) -> list[Element]:
+        """
+        Sends a POST request to the API endpoint with the provided files and
+        returns the response.
+
+        Args:
+            files (dict): A dictionary containing the files to be sent in the request.
+
+        Returns:
+            list[Element]: The elements parsed from the JSON response.
+
+        Raises:
+            ValueError: If there is an error in the API call.
+        """
+        try:
+            headers = {
+                "Authorization": f"Bearer {self.api_key}",
+            }
+            response = requests.post(
+                self.base_url,
+                headers=headers,
+                files=files,
+                data={
+                    "ocr": self.ocr,
+                    "model": self.model,
+                    "output_formats": f"['{self.output_format}']",
+                    "coordinates": self.coordinates,
+                    "base64_encoding": f"{self.base64_encoding}",
+                },
+            )
+            response.raise_for_status()
+            result: object = response.json().get("elements", [])
+            if not isinstance(result, list):
+                raise ValueError(f"Failed to parse JSON data: {result}")
+            result = cast(list[object], result)
+            return [Element.model_validate(element) for element in result]
+        except requests.HTTPError as e:
+            raise ValueError(f"HTTP error: {e.response.text}")
+        except requests.RequestException as e:
+            # Handle any request-related exceptions
+            raise ValueError(f"Failed to send request: {e}")
+        except json.JSONDecodeError as e:
+            # Handle JSON decode errors
+            raise ValueError(f"Failed to decode JSON response: {e}")
+        except Exception as e:
+            # Handle any other exceptions
+            raise ValueError(f"An error occurred: {e}")
+
+    def _split_and_request(
+        self, full_docs: PdfReader, start_page: int, num_pages: int = DEFAULT_NUM_PAGES
+    ) -> list[Element]:
+        """
+        Splits the full pdf document into partial pages and sends a request to the
+        server.
+
+        Args:
+            full_docs (PdfReader): The full document to be split and requested.
+            start_page (int): The starting page number for splitting the document.
+            num_pages (int, optional): The number of pages to split the document
+                                       into.
+                                       Defaults to DEFAULT_NUM_PAGES.
+
+        Returns:
+            list[Element]: The elements parsed from the server response.
+        """
+        merger = PdfWriter()
+        merger.append(
+            full_docs,
+            pages=(start_page, min(start_page + num_pages, full_docs.get_num_pages())),
+        )
+
+        with io.BytesIO() as buffer:
+            merger.write(buffer)
+            buffer.seek(0)
+            return self._get_response({"document": buffer})
+
+    def _element_document(self, element: Element, start_page: int = 0) -> Document:
+        """
+        Converts an element into a Document object.
+
+        Args:
+            element (Element): The element to convert.
+            start_page (int): The starting page number for splitting the document.
+                              This number starts from zero.
+
+        Returns:
+            A Document object built from the element.
+        """
+        metadata: dict[str, object] = element.model_dump(exclude_none=True)
+        metadata["page"] = element.page + start_page
+        return Document(
+            page_content=element.parse_text(self),
+            metadata=metadata,
+        )
+
+    def _page_document(self, elements: list[Element], start_page: int = 0) -> list[Document]:
+        """
+        Combines elements with the same page number into a single Document object.
+
+        Args:
+            elements (list): A list of elements containing page numbers.
+            start_page (int): The starting page number for splitting the document.
+                              This number starts from zero.
+
+        Returns:
+            list[Document]: A list of Document objects, each representing a page
+                            with its content and metadata.
+        """
+        documents: list[Document] = []
+        pages: list[int] = sorted(set(map(lambda x: x.page, elements)))
+        page_group: list[list[Element]] = [[element for element in elements if element.page == x] for x in pages]
+        for group in page_group:
+            metadata: dict[str, object] = {
+                "page": group[0].page + start_page,
+            }
+            if self.base64_encoding:
+                metadata["base64_encodings"] = [element.base64_encoding for element in group if element.base64_encoding]
+            if self.coordinates:
+                metadata["coordinates"] = [element.coordinates for element in group if element.coordinates]
+            documents.append(
+                Document(
+                    page_content=" ".join(element.parse_text(self) for element in group),
+                    metadata=metadata,
+                )
+            )
+
+        return documents
+
+    def lazy_parse(self, blob: Blob, is_batch: bool = False) -> Iterator[Document]:
+        """
+        Lazily parses a document and yields Document objects based on the specified
+        split type.
+
+        Args:
+            blob (Blob): The input document blob to parse.
+            is_batch (bool, optional): Whether to parse the document in batches.
+                                       Defaults to False (single page parsing)
+
+        Yields:
+            Document: The parsed document object.
+
+        Raises:
+            ValueError: If an invalid split type is provided.
+        """
+
+        if is_batch:
+            num_pages = DEFAULT_NUM_PAGES
+        else:
+            num_pages = 1
+
+        full_docs: Optional[PdfReader] = None
+        try:
+            full_docs = PdfReader(str(blob.path))
+            number_of_pages = full_docs.get_num_pages()
+        except PdfReadError:
+            number_of_pages = 1
+        except Exception as e:
+            raise ValueError(f"Failed to read PDF file: {e}")
+
+        if self.split == "none":
+            result = ""
+            base64_encodings: list[str] = []
+            coordinates: list[list[Coordinate]] = []
+
+            if full_docs is not None:
+                start_page = 0
+                num_pages = DEFAULT_NUM_PAGES
+                for _ in range(number_of_pages):
+                    if start_page >= number_of_pages:
+                        break
+
+                    elements = self._split_and_request(full_docs, start_page, num_pages)
+                    for element in elements:
+                        result += element.parse_text(self)
+                        if self.base64_encoding and (base64_encoding := element.base64_encoding):
+                            base64_encodings.append(base64_encoding)
+                        if self.coordinates and (coords := element.coordinates):
+                            coordinates.append(coords)
+
+                    start_page += num_pages
+
+            else:
+                if not blob.path:
+                    raise ValueError("Blob path is required for non-PDF files.")
+
+                with open(blob.path, "rb") as f:
+                    elements = self._get_response({"document": f})
+
+                for element in elements:
+                    result += element.parse_text(self)
+
+                    if self.base64_encoding and (base64_encoding := element.base64_encoding):
+                        base64_encodings.append(base64_encoding)
+                    if self.coordinates and (coords := element.coordinates):
+                        coordinates.append(coords)
+            metadata: dict[str, object] = {"total_pages": number_of_pages}
+            if self.coordinates:
+                metadata["coordinates"] = coordinates
+            if self.base64_encoding:
+                metadata["base64_encodings"] = base64_encodings
+
+            yield Document(
+                page_content=result,
+                metadata=metadata,
+            )
+
+        elif self.split == "element":
+            if full_docs is not None:
+                start_page = 0
+                for _ in range(number_of_pages):
+                    if start_page >= number_of_pages:
+                        break
+
+                    elements = self._split_and_request(full_docs, start_page, num_pages)
+                    for element in elements:
+                        yield self._element_document(element, start_page)
+
+                    start_page += num_pages
+
+            else:
+                if not blob.path:
+                    raise ValueError("Blob path is required for non-PDF files.")
+                with open(blob.path, "rb") as f:
+                    elements = self._get_response({"document": f})
+
+                for element in elements:
+                    yield self._element_document(element)
+
+        elif self.split == "page":
+            if full_docs is not None:
+                start_page = 0
+                for _ in range(number_of_pages):
+                    if start_page >= number_of_pages:
+                        break
+
+                    elements = self._split_and_request(full_docs, start_page, num_pages)
+                    yield from self._page_document(elements, start_page)
+
+                    start_page += num_pages
+            else:
+                if not blob.path:
+                    raise ValueError("Blob path is required for non-PDF files.")
+                with open(blob.path, "rb") as f:
+                    elements = self._get_response({"document": f})
+
+                yield from self._page_document(elements)
+
+        else:
+            raise ValueError(f"Invalid split type: {self.split}")
```
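A minimal sketch of driving the new parser over a local PDF. The file name is illustrative, the API key must be real, and the optional `chatterer` argument (used to describe figure elements) is omitted:

```python
import os

from langchain_core.document_loaders import Blob

from chatterer.tools.upstage_document_parser import UpstageDocumentParseParser

os.environ.setdefault("UPSTAGE_API_KEY", "up_...")  # your Upstage API key

parser = UpstageDocumentParseParser(split="page", output_format="markdown")

for doc in parser.lazy_parse(Blob.from_path("sample.pdf"), is_batch=True):
    print(doc.metadata.get("page"), doc.page_content[:80])
```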
{chatterer-0.1.11 → chatterer-0.1.13}/chatterer/tools/webpage_to_markdown/utils.py

```diff
@@ -277,6 +277,7 @@ def get_image_url_and_markdown_links(
 
         image_data = Base64Image.from_url_or_path(markdown_link.url, headers=headers, config=config)
         if not image_data:
+            image_matches.setdefault(None, []).append(markdown_link)
             continue
         image_matches.setdefault(image_data, []).append(markdown_link)
     return image_matches
@@ -294,6 +295,7 @@ async def aget_image_url_and_markdown_links(
             markdown_link.url, headers=headers, config=config, return_coro=True
         )
         if not image_data:
+            image_matches.setdefault(None, []).append(markdown_link)
             continue
         image_matches.setdefault(image_data, []).append(markdown_link)
     return image_matches
@@ -306,7 +308,10 @@ def replace_images(
     for image_description, markdown_links in image_description_and_references.items():
         for markdown_link in markdown_links:
             if image_description is None:
-
+                if markdown_link.type == "link":
+                    replacements.append((markdown_link, markdown_link.link_markdown))
+                elif markdown_link.type == "image":
+                    replacements.append((markdown_link, f""))
             else:
                 replacements.append((
                     markdown_link,
```

(The content of the single removed line in `replace_images` was truncated in the extracted page.)
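The behavioral change: links whose image could not be fetched are now grouped under the `None` key instead of being silently skipped, so `replace_images` can still rewrite them (plain links keep their link markdown; unfetchable images are dropped). A sketch of the resulting mapping shape, using strings as stand-ins for the package's `Base64Image` and markdown-link objects:

```python
from typing import Optional

# Sketch: fetched images key their links by Base64Image; failures group under None.
image_matches: dict[Optional[str], list[str]] = {
    "data:image/png;base64,iVBOR...": ["![logo](https://example.com/a.png)"],
    None: ["![broken](https://example.com/missing.png)"],
}

for image, links in image_matches.items():
    action = "describe" if image is not None else "strip or keep as plain link"
    print(action, links)
```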
chatterer-0.1.13/chatterer/utils/bytesio.py (new file)

This is the old `_open_stream` helper, generalized: raw `bytes`/`str` inputs can now be treated as data or as paths, and a caller-supplied stream is no longer closed on exit.

```diff
@@ -0,0 +1,59 @@
+import os
+from contextlib import contextmanager, suppress
+from io import BytesIO
+from typing import Iterator, Optional
+
+from ..common_types.io import BytesReadable, PathOrReadable, StringReadable
+
+
+@contextmanager
+def read_bytes_stream(
+    path_or_file: PathOrReadable,
+    assume_pathlike_bytes_as_path: bool = False,
+    assume_pathlike_string_as_path: bool = True,
+) -> Iterator[Optional[BytesReadable]]:
+    """
+    Context manager for opening a file or using an existing stream.
+
+    Handles different types of input (file paths, byte streams, string streams)
+    and yields a BytesReadable object that can be used to read binary data.
+
+    Args:
+        path_or_file: File path or readable object.
+        assume_pathlike_bytes_as_path: If True, treat a bytes object that names an existing file as a path; otherwise treat it as the data itself.
+        assume_pathlike_string_as_path: If True, treat a string that names an existing file as a path; otherwise treat it as the data itself.
+
+    Yields:
+        Optional[BytesReadable]: A readable binary stream, or None if opening fails.
+    """
+    stream: Optional[BytesReadable] = None
+    should_close: bool = True  # Whether the stream should be closed after use
+    try:
+        with suppress(BaseException):
+            if isinstance(path_or_file, BytesReadable):
+                # The input is already a bytes stream.
+                # NOTE: It is owned by the caller, so it must not be closed here.
+                stream = path_or_file
+                should_close = False
+            elif isinstance(path_or_file, StringReadable):
+                # Convert the string stream to a bytes stream
+                stream = BytesIO(path_or_file.read().encode("utf-8"))
+            elif isinstance(path_or_file, bytes):
+                # Interpret the bytes object as a path or as raw data
+                if assume_pathlike_bytes_as_path and os.path.exists(path_or_file):
+                    stream = open(path_or_file, "rb")
+                else:
+                    stream = BytesIO(path_or_file)
+            elif isinstance(path_or_file, str):
+                # Interpret the string as a path or as raw data
+                if assume_pathlike_string_as_path and os.path.exists(path_or_file):
+                    stream = open(path_or_file, "rb")
+                else:
+                    stream = BytesIO(path_or_file.encode("utf-8"))
+            else:
+                # Assume the input is a file descriptor or path
+                stream = open(path_or_file, "rb")
+        yield stream
+    finally:
+        if stream is not None and should_close:
+            stream.close()
```
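A short sketch of the helper's dispatch rules; note that raw `bytes` are treated as data (not a path) by default, and that a caller-supplied stream stays open:

```python
from io import BytesIO, StringIO

from chatterer.utils.bytesio import read_bytes_stream

# Raw bytes become a BytesIO by default (assume_pathlike_bytes_as_path=False)
with read_bytes_stream(b"raw data") as stream:
    assert stream is not None and stream.read() == b"raw data"

# A string stream is re-encoded to bytes
with read_bytes_stream(StringIO("hello")) as stream:
    assert stream is not None and stream.read() == b"hello"

# An existing bytes stream is passed through and not closed on exit
buffer = BytesIO(b"data")
with read_bytes_stream(buffer) as stream:
    assert stream is buffer
assert not buffer.closed
```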
{chatterer-0.1.11 → chatterer-0.1.13}/chatterer/utils/image.py

Error reporting moves from `traceback.print_exc` to a module-level `logging` logger. Korean comments are translated below, and the truncated removed line in `from_url_or_path` is restored as the un-wrapped `return cls._process_local_image(...)` call; the content of three other removed lines was truncated in the extracted page.

```diff
@@ -3,8 +3,8 @@ from __future__ import annotations
 import re
 from base64 import b64encode
 from io import BytesIO
+from logging import getLogger
 from pathlib import Path
-from traceback import print_exc
 from typing import (
     Awaitable,
     ClassVar,
@@ -28,6 +28,7 @@ from PIL.Image import Resampling
 from PIL.Image import open as image_open
 from pydantic import BaseModel
 
+logger = getLogger(__name__)
 ImageType: TypeAlias = Literal["jpeg", "jpg", "png", "gif", "webp", "bmp"]
 
 
@@ -123,7 +124,10 @@ class Base64Image(BaseModel):
        if return_coro:
            return cls._afetch_remote_image(url_or_path, headers, config)
        return cls._fetch_remote_image(url_or_path, headers, config)
-       return cls._process_local_image(Path(url_or_path), config)
+       try:
+           return cls._process_local_image(Path(url_or_path), config)
+       except Exception:
+           return None
 
     @property
     def data_uri(self) -> str:
@@ -167,7 +171,7 @@ class Base64Image(BaseModel):
            max_size_mb = config.get("max_size_mb", float("inf"))
            image_size_mb = len(image_data) / (1024 * 1024)
            if image_size_mb > max_size_mb:
-
+               logger.error(f"Image too large: {image_size_mb:.2f} MB > {max_size_mb} MB")
                return None
 
            # 2) Open the image with Pillow
@@ -182,7 +186,7 @@ class Base64Image(BaseModel):
            # min_largest_side threshold
            min_largest_side = config.get("min_largest_side", 1)
            if largest_side < min_largest_side:
-
+               logger.error(f"Image too small: {largest_side} < {min_largest_side}")
                return None
 
            # resize logic
@@ -200,7 +204,7 @@ class Base64Image(BaseModel):
            pil_format: str = (im.format or "").lower()
            allowed_formats: Sequence[ImageType] = config.get("formats", [])
            if not cls._verify_ext(pil_format, allowed_formats):
-
+               logger.error(f"Invalid format: {pil_format} not in {allowed_formats}")
                return None
 
            # Save back to bytes
@@ -210,7 +214,6 @@ class Base64Image(BaseModel):
            final_bytes = output_buffer.read()
 
        except Exception:
-           print_exc()
            return None
 
        # Final base64 encoding
```
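Since validation failures now go through the `logging` module rather than printing tracebacks, applications decide whether and where they surface. A minimal sketch:

```python
import logging

# Opt in to seeing why Base64Image rejected an image (too large, too small,
# or a disallowed format); without configuration the records fall back to
# the root logger's default handling.
logging.basicConfig(level=logging.ERROR, format="%(name)s: %(message)s")
```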
{chatterer-0.1.11 → chatterer-0.1.13}/chatterer.egg-info/PKG-INFO

Identical to the PKG-INFO diff above: the egg-info copy mirrors the top-level package metadata (version bump to 0.1.13, raised minimums for mistune, markitdown, youtube-transcript-api, and the langchain providers, plus the new `pypdf>=5.4.0` conversion dependency).
{chatterer-0.1.11 → chatterer-0.1.13}/chatterer.egg-info/SOURCES.txt

```diff
@@ -9,11 +9,14 @@ chatterer.egg-info/SOURCES.txt
 chatterer.egg-info/dependency_links.txt
 chatterer.egg-info/requires.txt
 chatterer.egg-info/top_level.txt
+chatterer/common_types/__init__.py
+chatterer/common_types/io.py
 chatterer/strategies/__init__.py
 chatterer/strategies/atom_of_thoughts.py
 chatterer/strategies/base.py
 chatterer/tools/__init__.py
 chatterer/tools/convert_to_text.py
+chatterer/tools/upstage_document_parser.py
 chatterer/tools/youtube.py
 chatterer/tools/citation_chunking/__init__.py
 chatterer/tools/citation_chunking/chunks.py
@@ -26,5 +29,6 @@ chatterer/tools/webpage_to_markdown/__init__.py
 chatterer/tools/webpage_to_markdown/playwright_bot.py
 chatterer/tools/webpage_to_markdown/utils.py
 chatterer/utils/__init__.py
+chatterer/utils/bytesio.py
 chatterer/utils/code_agent.py
 chatterer/utils/image.py
```
{chatterer-0.1.11 → chatterer-0.1.13}/chatterer.egg-info/requires.txt

```diff
@@ -11,10 +11,11 @@ markdownify>=1.1.0
 commonmark>=0.9.1
 playwright>=1.50.0
 pillow>=11.1.0
-mistune>=3.1.
-markitdown>=0.
+mistune>=3.1.3
+markitdown>=0.1.1
 pymupdf>=1.25.4
-youtube-transcript-api>=1.0.
+youtube-transcript-api>=1.0.3
+pypdf>=5.4.0
 
 [dev]
 neo4j-extension>=0.1.14
@@ -26,7 +27,7 @@ chatterer[langchain-providers]
 langchain-experimental>=0.3.4
 
 [langchain-providers]
-langchain-openai>=0.3.
-langchain-anthropic>=0.3.
-langchain-google-genai>=2.
-langchain-ollama>=0.
+langchain-openai>=0.3.11
+langchain-anthropic>=0.3.10
+langchain-google-genai>=2.1.1
+langchain-ollama>=0.3.0
```
{chatterer-0.1.11 → chatterer-0.1.13}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [project]
 name = "chatterer"
-version = "0.1.11"
+version = "0.1.13"
 description = "The highest-level interface for various LLM APIs."
 readme = "README.md"
 requires-python = ">=3.12"
@@ -13,16 +13,17 @@ conversion = [
     "commonmark>=0.9.1",
     "playwright>=1.50.0",
     "pillow>=11.1.0",
-    "mistune>=3.1.
-    "markitdown>=0.
+    "mistune>=3.1.3",
+    "markitdown>=0.1.1",
     "pymupdf>=1.25.4",
-    "youtube-transcript-api>=1.0.
+    "youtube-transcript-api>=1.0.3",
+    "pypdf>=5.4.0",
 ]
 langchain = ["chatterer[langchain-providers]", "langchain-experimental>=0.3.4"]
 langchain-providers = [
-    "langchain-openai>=0.3.
-    "langchain-anthropic>=0.3.
-    "langchain-google-genai>=2.
-    "langchain-ollama>=0.
+    "langchain-openai>=0.3.11",
+    "langchain-anthropic>=0.3.10",
+    "langchain-google-genai>=2.1.1",
+    "langchain-ollama>=0.3.0",
 ]
 all = ["chatterer[langchain]", "chatterer[conversion]", "chatterer[dev]"]
```