chatterer 0.1.26__py3-none-any.whl → 0.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +87 -87
- chatterer/common_types/__init__.py +21 -21
- chatterer/common_types/io.py +19 -19
- chatterer/constants.py +5 -0
- chatterer/examples/__main__.py +75 -75
- chatterer/examples/any2md.py +83 -85
- chatterer/examples/pdf2md.py +231 -338
- chatterer/examples/pdf2txt.py +52 -54
- chatterer/examples/ppt.py +487 -486
- chatterer/examples/pw.py +141 -143
- chatterer/examples/snippet.py +54 -56
- chatterer/examples/transcribe.py +192 -192
- chatterer/examples/upstage.py +87 -89
- chatterer/examples/web2md.py +80 -80
- chatterer/interactive.py +422 -354
- chatterer/language_model.py +530 -536
- chatterer/messages.py +21 -21
- chatterer/tools/__init__.py +46 -46
- chatterer/tools/caption_markdown_images.py +388 -384
- chatterer/tools/citation_chunking/__init__.py +3 -3
- chatterer/tools/citation_chunking/chunks.py +51 -53
- chatterer/tools/citation_chunking/citation_chunker.py +117 -118
- chatterer/tools/citation_chunking/citations.py +284 -285
- chatterer/tools/citation_chunking/prompt.py +157 -157
- chatterer/tools/citation_chunking/reference.py +26 -26
- chatterer/tools/citation_chunking/utils.py +138 -138
- chatterer/tools/convert_pdf_to_markdown.py +636 -645
- chatterer/tools/convert_to_text.py +446 -446
- chatterer/tools/upstage_document_parser.py +704 -705
- chatterer/tools/webpage_to_markdown.py +739 -739
- chatterer/tools/youtube.py +146 -147
- chatterer/utils/__init__.py +15 -15
- chatterer/utils/base64_image.py +349 -350
- chatterer/utils/bytesio.py +59 -59
- chatterer/utils/code_agent.py +237 -237
- chatterer/utils/imghdr.py +145 -145
- {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/METADATA +377 -390
- chatterer-0.1.27.dist-info/RECORD +43 -0
- chatterer-0.1.26.dist-info/RECORD +0 -42
- {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/WHEEL +0 -0
- {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/entry_points.txt +0 -0
- {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/top_level.txt +0 -0
@@ -1,157 +1,157 @@
|
|
1
|
-
"""
|
2
|
-
ragent/prompt/citation_chunking.py
|
3
|
-
|
4
|
-
This module defines prompt constants for citation chunking.
|
5
|
-
The LLM is expected to return JSON objects that include only the text snippets for the beginning and end of the citation span.
|
6
|
-
The character indices will be computed in a post‐processing step.
|
7
|
-
"""
|
8
|
-
|
9
|
-
from functools import cache
|
10
|
-
|
11
|
-
|
12
|
-
@cache
|
13
|
-
def generate_instruction() -> str:
|
14
|
-
from .chunks import CitationChunk, CitationChunks
|
15
|
-
from .reference import (
|
16
|
-
MultiMatchRegex,
|
17
|
-
SingleMatchCitation,
|
18
|
-
)
|
19
|
-
|
20
|
-
return (
|
21
|
-
"You are an AI specialized in 'citation-based text chunking'.\n"
|
22
|
-
"Given a document, perform the following steps:\n"
|
23
|
-
"1) Identify the major topics in the document.\n"
|
24
|
-
"2) For each topic, provide a list of citation objects indicating the text snippets at the beginning and end of the relevant paragraph(s) for that topic.\n\n"
|
25
|
-
"Important:\n"
|
26
|
-
"- Return citation objects with 'start_text' and 'end_text' fields to precisely capture the text span. Do NOT include character indices.\n"
|
27
|
-
"- If a regular expression based matching is more appropriate for a topic (e.g. for multiple matches), you may include a regex object of type 'multi_match_regex'.\n\n"
|
28
|
-
"Return JSON strictly in the following format:\n"
|
29
|
-
"{json_example}\n\n"
|
30
|
-
"1) Return only valid JSON (no extra keys).\n"
|
31
|
-
"2) Do NOT include any commentary.\n"
|
32
|
-
"3) Ensure that the citations capture the entire relevant paragraph without overlap or omission."
|
33
|
-
).format(
|
34
|
-
json_example=CitationChunks(
|
35
|
-
citation_chunks=[
|
36
|
-
CitationChunk(
|
37
|
-
subject="Quantum Advantage",
|
38
|
-
references=[
|
39
|
-
SingleMatchCitation(
|
40
|
-
start_from="Starting snippet...",
|
41
|
-
end_at="... Ending snippet",
|
42
|
-
),
|
43
|
-
MultiMatchRegex(
|
44
|
-
type="multi_match_regex",
|
45
|
-
regular_expression="Some.*?regex.*?pattern",
|
46
|
-
),
|
47
|
-
],
|
48
|
-
),
|
49
|
-
]
|
50
|
-
).model_dump_json(indent=2)
|
51
|
-
)
|
52
|
-
|
53
|
-
|
54
|
-
@cache
|
55
|
-
def generate_human_assistant_fewshot_examples() -> list[tuple[str, str]]:
|
56
|
-
from .chunks import CitationChunk, CitationChunks
|
57
|
-
from .reference import SingleMatchCitation
|
58
|
-
|
59
|
-
return [
|
60
|
-
(
|
61
|
-
"Agent-Semantic Chunking of the following text:\n\n"
|
62
|
-
"Title: Revolutionary Breakthrough in Quantum Computing\n\n"
|
63
|
-
"In a landmark development, researchers at the National Quantum Laboratory unveiled a quantum computer "
|
64
|
-
"that demonstrates clear quantum advantage by performing computations that are infeasible on classical systems.\n\n"
|
65
|
-
"The breakthrough is the result of years of rigorous research and international collaboration. "
|
66
|
-
"The system leverages entanglement and superposition to process complex algorithms at unprecedented speeds.\n\n"
|
67
|
-
"However, practical applications are still emerging, and experts caution about scalability challenges. "
|
68
|
-
"Meanwhile, several tech giants are expressing keen interest in integrating quantum technology into future products.\n\n"
|
69
|
-
"Please classify the major topics and return the exact text snippets (for the start and end of the relevant paragraphs) for each topic.",
|
70
|
-
CitationChunks(
|
71
|
-
citation_chunks=[
|
72
|
-
CitationChunk(
|
73
|
-
subject="Quantum Advantage",
|
74
|
-
references=[
|
75
|
-
SingleMatchCitation(
|
76
|
-
start_from="In a landmark development",
|
77
|
-
end_at="on classical systems.",
|
78
|
-
),
|
79
|
-
],
|
80
|
-
),
|
81
|
-
CitationChunk(
|
82
|
-
subject="Research Collaboration",
|
83
|
-
references=[
|
84
|
-
SingleMatchCitation(
|
85
|
-
start_from="The breakthrough is the result",
|
86
|
-
end_at="unprecedented speeds.",
|
87
|
-
),
|
88
|
-
],
|
89
|
-
),
|
90
|
-
CitationChunk(
|
91
|
-
subject="Practical Challenges",
|
92
|
-
references=[
|
93
|
-
SingleMatchCitation(
|
94
|
-
start_from="However, practical applications",
|
95
|
-
end_at="scalability challenges.",
|
96
|
-
),
|
97
|
-
],
|
98
|
-
),
|
99
|
-
CitationChunk(
|
100
|
-
subject="Industry Interest",
|
101
|
-
references=[
|
102
|
-
SingleMatchCitation(
|
103
|
-
start_from="Meanwhile, several tech giants",
|
104
|
-
end_at="future products.",
|
105
|
-
),
|
106
|
-
],
|
107
|
-
),
|
108
|
-
]
|
109
|
-
).model_dump_json(indent=2),
|
110
|
-
),
|
111
|
-
(
|
112
|
-
"Agent-Semantic Chunking of the following text:\n\n"
|
113
|
-
"Title: Rising Seas and Coastal Erosion: A Global Crisis\n\n"
|
114
|
-
"Communities worldwide face the impacts of climate change as rising sea levels lead to accelerated coastal erosion, "
|
115
|
-
"jeopardizing homes and critical infrastructure.\n\n"
|
116
|
-
'In a small coastal town, residents noted that "the encroaching sea" has already begun to claim beachfront properties, '
|
117
|
-
"prompting local authorities to implement emergency measures.\n\n"
|
118
|
-
"Environmental experts warn that without significant intervention, the frequency and severity of these events will increase, "
|
119
|
-
"further exacerbating the global climate crisis.\n\n"
|
120
|
-
"Please classify the major topics and return the exact text snippets (for the start and end of the relevant paragraphs) for each topic.",
|
121
|
-
CitationChunks(
|
122
|
-
citation_chunks=[
|
123
|
-
CitationChunk(
|
124
|
-
subject="Coastal Erosion Impact",
|
125
|
-
references=[
|
126
|
-
SingleMatchCitation(
|
127
|
-
start_from="Communities worldwide face the impacts",
|
128
|
-
end_at="critical infrastructure.",
|
129
|
-
),
|
130
|
-
],
|
131
|
-
),
|
132
|
-
CitationChunk(
|
133
|
-
subject="Local Emergency Response",
|
134
|
-
references=[
|
135
|
-
SingleMatchCitation(
|
136
|
-
start_from="In a small coastal town",
|
137
|
-
end_at="emergency measures.",
|
138
|
-
),
|
139
|
-
],
|
140
|
-
),
|
141
|
-
CitationChunk(
|
142
|
-
subject="Expert Warning",
|
143
|
-
references=[
|
144
|
-
SingleMatchCitation(
|
145
|
-
start_from="Environmental experts warn",
|
146
|
-
end_at="global climate crisis.",
|
147
|
-
),
|
148
|
-
],
|
149
|
-
),
|
150
|
-
]
|
151
|
-
).model_dump_json(indent=2),
|
152
|
-
),
|
153
|
-
]
|
154
|
-
|
155
|
-
|
156
|
-
def generate_fewshot_affirmative_response() -> str:
|
157
|
-
return "Great! I will now perform the citation-based chunking. Please provide the document to process!"
|
1
|
+
"""
|
2
|
+
chatterer/tools/citation_chunking/prompt.py
|
3
|
+
|
4
|
+
This module defines prompt constants for citation chunking.
|
5
|
+
The LLM is expected to return JSON objects that include only the text snippets for the beginning and end of the citation span.
|
6
|
+
The character indices will be computed in a post-processing step.
|
7
|
+
"""
|
8
|
+
|
9
|
+
from functools import cache
|
10
|
+
|
11
|
+
|
12
|
+
@cache
def generate_instruction() -> str:
    """Build the system instruction for citation-based chunking.

    The instruction embeds a JSON example produced by dumping a small
    ``CitationChunks`` instance, so the example always reflects the current
    schema. The prose part names the citation fields ``start_from`` and
    ``end_at`` so that it agrees with that example (it previously said
    'start_text' / 'end_text', which are not fields of the schema).

    Returns:
        The full instruction text. The result is memoised by
        ``functools.cache``, so the string is built only once per process.
    """
    # NOTE(review): imports are kept function-local as in the original,
    # presumably to avoid an import cycle at module load time -- confirm.
    from .chunks import CitationChunk, CitationChunks
    from .reference import (
        MultiMatchRegex,
        SingleMatchCitation,
    )

    return (
        "You are an AI specialized in 'citation-based text chunking'.\n"
        "Given a document, perform the following steps:\n"
        "1) Identify the major topics in the document.\n"
        "2) For each topic, provide a list of citation objects indicating the text snippets at the beginning and end of the relevant paragraph(s) for that topic.\n\n"
        "Important:\n"
        # Field names must match SingleMatchCitation and the JSON example below.
        "- Return citation objects with 'start_from' and 'end_at' fields to precisely capture the text span. Do NOT include character indices.\n"
        "- If a regular expression based matching is more appropriate for a topic (e.g. for multiple matches), you may include a regex object of type 'multi_match_regex'.\n\n"
        "Return JSON strictly in the following format:\n"
        "{json_example}\n\n"
        "1) Return only valid JSON (no extra keys).\n"
        "2) Do NOT include any commentary.\n"
        "3) Ensure that the citations capture the entire relevant paragraph without overlap or omission."
    ).format(
        json_example=CitationChunks(
            citation_chunks=[
                CitationChunk(
                    subject="Quantum Advantage",
                    references=[
                        SingleMatchCitation(
                            start_from="Starting snippet...",
                            end_at="... Ending snippet",
                        ),
                        MultiMatchRegex(
                            type="multi_match_regex",
                            regular_expression="Some.*?regex.*?pattern",
                        ),
                    ],
                ),
            ]
        ).model_dump_json(indent=2)
    )
|
52
|
+
|
53
|
+
|
54
|
+
@cache
def generate_human_assistant_fewshot_examples() -> list[tuple[str, str]]:
    """Return the few-shot (human prompt, assistant answer) pairs.

    Each pair holds a chunking request for a short sample article and the
    JSON answer, obtained by dumping a ``CitationChunks`` instance with
    ``model_dump_json(indent=2)``.

    Returns:
        A list of two ``(prompt, json_answer)`` tuples. Because of
        ``functools.cache`` the very same list object is handed back on
        every call, so callers should treat it as read-only.
    """
    from .chunks import CitationChunk, CitationChunks
    from .reference import SingleMatchCitation

    def render_answer(*topics: tuple[str, str, str]) -> str:
        # Each topic is (subject, start snippet, end snippet) and becomes one
        # CitationChunk holding a single SingleMatchCitation.
        return CitationChunks(
            citation_chunks=[
                CitationChunk(
                    subject=subject,
                    references=[
                        SingleMatchCitation(
                            start_from=opening,
                            end_at=closing,
                        ),
                    ],
                )
                for subject, opening, closing in topics
            ]
        ).model_dump_json(indent=2)

    # Text shared by both example prompts.
    task_header = "Agent-Semantic Chunking of the following text:\n\n"
    task_footer = "Please classify the major topics and return the exact text snippets (for the start and end of the relevant paragraphs) for each topic."

    quantum_article = (
        "Title: Revolutionary Breakthrough in Quantum Computing\n\n"
        "In a landmark development, researchers at the National Quantum Laboratory unveiled a quantum computer "
        "that demonstrates clear quantum advantage by performing computations that are infeasible on classical systems.\n\n"
        "The breakthrough is the result of years of rigorous research and international collaboration. "
        "The system leverages entanglement and superposition to process complex algorithms at unprecedented speeds.\n\n"
        "However, practical applications are still emerging, and experts caution about scalability challenges. "
        "Meanwhile, several tech giants are expressing keen interest in integrating quantum technology into future products.\n\n"
    )
    quantum_answer = render_answer(
        ("Quantum Advantage", "In a landmark development", "on classical systems."),
        ("Research Collaboration", "The breakthrough is the result", "unprecedented speeds."),
        ("Practical Challenges", "However, practical applications", "scalability challenges."),
        ("Industry Interest", "Meanwhile, several tech giants", "future products."),
    )

    coastal_article = (
        "Title: Rising Seas and Coastal Erosion: A Global Crisis\n\n"
        "Communities worldwide face the impacts of climate change as rising sea levels lead to accelerated coastal erosion, "
        "jeopardizing homes and critical infrastructure.\n\n"
        'In a small coastal town, residents noted that "the encroaching sea" has already begun to claim beachfront properties, '
        "prompting local authorities to implement emergency measures.\n\n"
        "Environmental experts warn that without significant intervention, the frequency and severity of these events will increase, "
        "further exacerbating the global climate crisis.\n\n"
    )
    coastal_answer = render_answer(
        ("Coastal Erosion Impact", "Communities worldwide face the impacts", "critical infrastructure."),
        ("Local Emergency Response", "In a small coastal town", "emergency measures."),
        ("Expert Warning", "Environmental experts warn", "global climate crisis."),
    )

    return [
        (task_header + quantum_article + task_footer, quantum_answer),
        (task_header + coastal_article + task_footer, coastal_answer),
    ]
|
154
|
+
|
155
|
+
|
156
|
+
def generate_fewshot_affirmative_response() -> str:
    """Return the fixed acknowledgement message for citation-based chunking.

    Returns:
        A constant string stating that chunking will be performed and asking
        for the document to process.
    """
    acknowledgement = "Great! I will now perform the citation-based chunking. Please provide the document to process!"
    return acknowledgement
|
@@ -1,26 +1,26 @@
|
|
1
|
-
from typing import Literal, TypeAlias
|
2
|
-
|
3
|
-
from pydantic import BaseModel, Field
|
4
|
-
|
5
|
-
|
6
|
-
class MultiMatchRegex(BaseModel):
|
7
|
-
type: Literal["multi_match_regex"] = Field(
|
8
|
-
description="A regex pattern that should match multiple instances of the subject in the document."
|
9
|
-
)
|
10
|
-
regular_expression: str = Field(
|
11
|
-
description="The regex pattern that should match multiple instances of the subject in the document."
|
12
|
-
)
|
13
|
-
|
14
|
-
def __hash__(self) -> int:
|
15
|
-
return hash((self.type, self.regular_expression))
|
16
|
-
|
17
|
-
|
18
|
-
class SingleMatchCitation(BaseModel):
|
19
|
-
start_from: str = Field(description="A snippet of text at the beginning of the cited section.")
|
20
|
-
end_at: str = Field(description="A snippet of text at the end of the cited section.")
|
21
|
-
|
22
|
-
def __hash__(self) -> int:
|
23
|
-
return hash((self.start_from, self.end_at))
|
24
|
-
|
25
|
-
|
26
|
-
Reference: TypeAlias = SingleMatchCitation | MultiMatchRegex
|
1
|
+
from typing import Literal, TypeAlias
|
2
|
+
|
3
|
+
from pydantic import BaseModel, Field
|
4
|
+
|
5
|
+
|
6
|
+
class MultiMatchRegex(BaseModel):
    # Reference expressed as a regular expression rather than as a pair of
    # start/end snippets.
    # NOTE: documented with comments instead of a class docstring because
    # pydantic copies a model's docstring into its generated JSON schema.

    # Constant tag; the only accepted value is "multi_match_regex".
    type: Literal["multi_match_regex"] = Field(
        description="A regex pattern that should match multiple instances of the subject in the document."
    )
    # The pattern text itself.
    regular_expression: str = Field(
        description="The regex pattern that should match multiple instances of the subject in the document."
    )

    def __hash__(self) -> int:
        """Hash on the (type, regular_expression) pair."""
        identity = (self.type, self.regular_expression)
        return hash(identity)
|
16
|
+
|
17
|
+
|
18
|
+
class SingleMatchCitation(BaseModel):
    # Reference described by the snippet that opens the cited section and the
    # snippet that closes it.
    # NOTE: documented with comments instead of a class docstring because
    # pydantic copies a model's docstring into its generated JSON schema.

    start_from: str = Field(description="A snippet of text at the beginning of the cited section.")
    end_at: str = Field(description="A snippet of text at the end of the cited section.")

    def __hash__(self) -> int:
        """Hash on the (start_from, end_at) pair."""
        identity = (self.start_from, self.end_at)
        return hash(identity)
|
24
|
+
|
25
|
+
|
26
|
+
# A reference is either a start/end snippet citation or a multi-match regex.
Reference: TypeAlias = SingleMatchCitation | MultiMatchRegex
|
@@ -1,138 +1,138 @@
|
|
1
|
-
from typing import Callable, NamedTuple, Self, TypeVar
|
2
|
-
|
3
|
-
from pydantic import BaseModel
|
4
|
-
|
5
|
-
T = TypeVar("T", bound=BaseModel)
|
6
|
-
|
7
|
-
|
8
|
-
class MatchedText(NamedTuple):
|
9
|
-
text: str
|
10
|
-
start_idx: int
|
11
|
-
end_idx: int
|
12
|
-
|
13
|
-
@classmethod
|
14
|
-
def from_text(
|
15
|
-
cls,
|
16
|
-
full_text: str,
|
17
|
-
len_func: Callable[[str], int],
|
18
|
-
chunk_size: int = 2048,
|
19
|
-
token_overlap: int = 0,
|
20
|
-
separator: str = "\n",
|
21
|
-
) -> list[Self]:
|
22
|
-
"""
|
23
|
-
토큰 수 제한과 선택적 오버랩을 기준으로 텍스트를 청크로 분할합니다.
|
24
|
-
각 청크는 원본 텍스트 내의 위치 정보 (start_idx, end_idx)와 함께 반환됩니다.
|
25
|
-
텍스트는 separator 문자열로 분할하며, 토큰 수는 len_func 함수를 통해 계산합니다.
|
26
|
-
|
27
|
-
Args:
|
28
|
-
full_text: 분할할 전체 텍스트.
|
29
|
-
len_func: 주어진 텍스트의 토큰 수를 반환하는 함수.
|
30
|
-
chunk_size: 각 청크의 최대 토큰 수. 기본값은 2048.
|
31
|
-
token_overlap: 청크 간 중첩할 토큰 수. 기본값은 0.
|
32
|
-
separator: 텍스트를 분할할 구분자 문자열. 기본값은 "\n".
|
33
|
-
|
34
|
-
Returns:
|
35
|
-
각 요소가 (chunk_text, start_idx, end_idx)인 튜플의 리스트.
|
36
|
-
chunk_text는 whole_text 내에서 whole_text[start_idx:end_idx]와 동일한 부분 문자열입니다.
|
37
|
-
"""
|
38
|
-
text_chunks: list[Self] = []
|
39
|
-
sep_token_count: int = len_func(separator)
|
40
|
-
sep_len = len(separator)
|
41
|
-
|
42
|
-
# 먼저, separator를 기준으로 원본 텍스트를 분할하되 각 조각의 시작/종료 인덱스를 기록합니다.
|
43
|
-
piece_infos: list[Self] = [] # 각 튜플: (piece_text, start_index, end_index)
|
44
|
-
start_idx = 0
|
45
|
-
while True:
|
46
|
-
idx = full_text.find(separator, start_idx)
|
47
|
-
if idx == -1:
|
48
|
-
# 마지막 조각: separator가 더 이상 없으므로 전체 남은 부분을 추가합니다.
|
49
|
-
piece_infos.append(
|
50
|
-
cls(
|
51
|
-
text=full_text[start_idx:],
|
52
|
-
start_idx=start_idx,
|
53
|
-
end_idx=len(full_text),
|
54
|
-
)
|
55
|
-
)
|
56
|
-
break
|
57
|
-
else:
|
58
|
-
piece_infos.append(
|
59
|
-
cls(
|
60
|
-
text=full_text[start_idx:idx],
|
61
|
-
start_idx=start_idx,
|
62
|
-
end_idx=idx,
|
63
|
-
)
|
64
|
-
)
|
65
|
-
start_idx = idx + sep_len
|
66
|
-
|
67
|
-
current_chunk: list[Self] = []
|
68
|
-
current_token_count: int = 0
|
69
|
-
i = 0
|
70
|
-
while i < len(piece_infos):
|
71
|
-
piece_info = piece_infos[i]
|
72
|
-
piece = piece_info.text
|
73
|
-
piece_start = piece_info.start_idx
|
74
|
-
piece_end = piece_info.end_idx
|
75
|
-
# 원래 코드는 각 조각에 separator의 토큰 수도 포함합니다.
|
76
|
-
piece_token_count: int = len_func(piece) + sep_token_count
|
77
|
-
|
78
|
-
# 현재 청크에 추가하면 chunk_size를 초과하는 경우
|
79
|
-
if current_token_count + piece_token_count > chunk_size:
|
80
|
-
# 단일 조각이 chunk_size보다 큰 경우엔 어쩔 수 없이 추가합니다.
|
81
|
-
if not current_chunk:
|
82
|
-
current_chunk.append(
|
83
|
-
cls(
|
84
|
-
text=piece,
|
85
|
-
start_idx=piece_start,
|
86
|
-
end_idx=piece_end,
|
87
|
-
)
|
88
|
-
)
|
89
|
-
current_token_count += piece_token_count
|
90
|
-
i += 1
|
91
|
-
# 현재 청크 완성 → 청크에 추가
|
92
|
-
chunk_start = current_chunk[0].start_idx
|
93
|
-
# current_chunk에 담긴 조각들은 원본 텍스트상 연속되어 있으므로,
|
94
|
-
# 청크의 종료 인덱스는 마지막 조각의 end_index가 됩니다.
|
95
|
-
chunk_end = current_chunk[-1].end_idx
|
96
|
-
# 원본 텍스트의 해당 구간을 그대로 추출하면 separator가 포함됩니다.
|
97
|
-
chunk_text = full_text[chunk_start:chunk_end]
|
98
|
-
text_chunks.append(
|
99
|
-
cls(
|
100
|
-
text=chunk_text,
|
101
|
-
start_idx=chunk_start,
|
102
|
-
end_idx=chunk_end,
|
103
|
-
)
|
104
|
-
)
|
105
|
-
|
106
|
-
# token_overlap이 적용되는 경우: 청크 끝부분 일부를 다음 청크에 오버랩합니다.
|
107
|
-
if token_overlap > 0:
|
108
|
-
overlap_chunk: list[Self] = []
|
109
|
-
overlap_count: int = 0
|
110
|
-
# 뒤에서부터 역순으로 오버랩할 조각들을 선택합니다.
|
111
|
-
for j in range(len(current_chunk) - 1, -1, -1):
|
112
|
-
p_text = current_chunk[j].text
|
113
|
-
p_token_count = len_func(p_text) + sep_token_count
|
114
|
-
# 최소 한 조각은 포함하고, 오버랩 토큰 수가 token_overlap 이하라면 계속 추가
|
115
|
-
if overlap_count + p_token_count <= token_overlap or not overlap_chunk:
|
116
|
-
overlap_chunk.insert(0, current_chunk[j])
|
117
|
-
overlap_count += p_token_count
|
118
|
-
else:
|
119
|
-
break
|
120
|
-
current_chunk = overlap_chunk.copy()
|
121
|
-
current_token_count = overlap_count
|
122
|
-
else:
|
123
|
-
current_chunk.clear()
|
124
|
-
current_token_count = 0
|
125
|
-
else:
|
126
|
-
# 청크에 추가 후 다음 조각 진행
|
127
|
-
current_chunk.append(cls(text=piece, start_idx=piece_start, end_idx=piece_end))
|
128
|
-
current_token_count += piece_token_count
|
129
|
-
i += 1
|
130
|
-
|
131
|
-
# 남은 조각이 있다면 마지막 청크로 추가합니다.
|
132
|
-
if current_chunk:
|
133
|
-
chunk_start = current_chunk[0].start_idx
|
134
|
-
chunk_end = current_chunk[-1].end_idx
|
135
|
-
chunk_text = full_text[chunk_start:chunk_end]
|
136
|
-
text_chunks.append(cls(text=chunk_text, start_idx=chunk_start, end_idx=chunk_end))
|
137
|
-
|
138
|
-
return text_chunks
|
1
|
+
from typing import Callable, NamedTuple, Self, TypeVar
|
2
|
+
|
3
|
+
from pydantic import BaseModel
|
4
|
+
|
5
|
+
# Type variable restricted to pydantic models.
# NOTE(review): not referenced by anything else visible in this module -- confirm it is still needed.
T = TypeVar("T", bound=BaseModel)
|
6
|
+
|
7
|
+
|
8
|
+
class MatchedText(NamedTuple):
|
9
|
+
text: str
|
10
|
+
start_idx: int
|
11
|
+
end_idx: int
|
12
|
+
|
13
|
+
@classmethod
|
14
|
+
def from_text(
|
15
|
+
cls,
|
16
|
+
full_text: str,
|
17
|
+
len_func: Callable[[str], int],
|
18
|
+
chunk_size: int = 2048,
|
19
|
+
token_overlap: int = 0,
|
20
|
+
separator: str = "\n",
|
21
|
+
) -> list[Self]:
|
22
|
+
"""
|
23
|
+
토큰 수 제한과 선택적 오버랩을 기준으로 텍스트를 청크로 분할합니다.
|
24
|
+
각 청크는 원본 텍스트 내의 위치 정보 (start_idx, end_idx)와 함께 반환됩니다.
|
25
|
+
텍스트는 separator 문자열로 분할하며, 토큰 수는 len_func 함수를 통해 계산합니다.
|
26
|
+
|
27
|
+
Args:
|
28
|
+
full_text: 분할할 전체 텍스트.
|
29
|
+
len_func: 주어진 텍스트의 토큰 수를 반환하는 함수.
|
30
|
+
chunk_size: 각 청크의 최대 토큰 수. 기본값은 2048.
|
31
|
+
token_overlap: 청크 간 중첩할 토큰 수. 기본값은 0.
|
32
|
+
separator: 텍스트를 분할할 구분자 문자열. 기본값은 "\n".
|
33
|
+
|
34
|
+
Returns:
|
35
|
+
각 요소가 (chunk_text, start_idx, end_idx)인 튜플의 리스트.
|
36
|
+
chunk_text는 whole_text 내에서 whole_text[start_idx:end_idx]와 동일한 부분 문자열입니다.
|
37
|
+
"""
|
38
|
+
text_chunks: list[Self] = []
|
39
|
+
sep_token_count: int = len_func(separator)
|
40
|
+
sep_len = len(separator)
|
41
|
+
|
42
|
+
# 먼저, separator를 기준으로 원본 텍스트를 분할하되 각 조각의 시작/종료 인덱스를 기록합니다.
|
43
|
+
piece_infos: list[Self] = [] # 각 튜플: (piece_text, start_index, end_index)
|
44
|
+
start_idx = 0
|
45
|
+
while True:
|
46
|
+
idx = full_text.find(separator, start_idx)
|
47
|
+
if idx == -1:
|
48
|
+
# 마지막 조각: separator가 더 이상 없으므로 전체 남은 부분을 추가합니다.
|
49
|
+
piece_infos.append(
|
50
|
+
cls(
|
51
|
+
text=full_text[start_idx:],
|
52
|
+
start_idx=start_idx,
|
53
|
+
end_idx=len(full_text),
|
54
|
+
)
|
55
|
+
)
|
56
|
+
break
|
57
|
+
else:
|
58
|
+
piece_infos.append(
|
59
|
+
cls(
|
60
|
+
text=full_text[start_idx:idx],
|
61
|
+
start_idx=start_idx,
|
62
|
+
end_idx=idx,
|
63
|
+
)
|
64
|
+
)
|
65
|
+
start_idx = idx + sep_len
|
66
|
+
|
67
|
+
current_chunk: list[Self] = []
|
68
|
+
current_token_count: int = 0
|
69
|
+
i = 0
|
70
|
+
while i < len(piece_infos):
|
71
|
+
piece_info = piece_infos[i]
|
72
|
+
piece = piece_info.text
|
73
|
+
piece_start = piece_info.start_idx
|
74
|
+
piece_end = piece_info.end_idx
|
75
|
+
# 원래 코드는 각 조각에 separator의 토큰 수도 포함합니다.
|
76
|
+
piece_token_count: int = len_func(piece) + sep_token_count
|
77
|
+
|
78
|
+
# 현재 청크에 추가하면 chunk_size를 초과하는 경우
|
79
|
+
if current_token_count + piece_token_count > chunk_size:
|
80
|
+
# 단일 조각이 chunk_size보다 큰 경우엔 어쩔 수 없이 추가합니다.
|
81
|
+
if not current_chunk:
|
82
|
+
current_chunk.append(
|
83
|
+
cls(
|
84
|
+
text=piece,
|
85
|
+
start_idx=piece_start,
|
86
|
+
end_idx=piece_end,
|
87
|
+
)
|
88
|
+
)
|
89
|
+
current_token_count += piece_token_count
|
90
|
+
i += 1
|
91
|
+
# 현재 청크 완성 → 청크에 추가
|
92
|
+
chunk_start = current_chunk[0].start_idx
|
93
|
+
# current_chunk에 담긴 조각들은 원본 텍스트상 연속되어 있으므로,
|
94
|
+
# 청크의 종료 인덱스는 마지막 조각의 end_index가 됩니다.
|
95
|
+
chunk_end = current_chunk[-1].end_idx
|
96
|
+
# 원본 텍스트의 해당 구간을 그대로 추출하면 separator가 포함됩니다.
|
97
|
+
chunk_text = full_text[chunk_start:chunk_end]
|
98
|
+
text_chunks.append(
|
99
|
+
cls(
|
100
|
+
text=chunk_text,
|
101
|
+
start_idx=chunk_start,
|
102
|
+
end_idx=chunk_end,
|
103
|
+
)
|
104
|
+
)
|
105
|
+
|
106
|
+
# token_overlap이 적용되는 경우: 청크 끝부분 일부를 다음 청크에 오버랩합니다.
|
107
|
+
if token_overlap > 0:
|
108
|
+
overlap_chunk: list[Self] = []
|
109
|
+
overlap_count: int = 0
|
110
|
+
# 뒤에서부터 역순으로 오버랩할 조각들을 선택합니다.
|
111
|
+
for j in range(len(current_chunk) - 1, -1, -1):
|
112
|
+
p_text = current_chunk[j].text
|
113
|
+
p_token_count = len_func(p_text) + sep_token_count
|
114
|
+
# 최소 한 조각은 포함하고, 오버랩 토큰 수가 token_overlap 이하라면 계속 추가
|
115
|
+
if overlap_count + p_token_count <= token_overlap or not overlap_chunk:
|
116
|
+
overlap_chunk.insert(0, current_chunk[j])
|
117
|
+
overlap_count += p_token_count
|
118
|
+
else:
|
119
|
+
break
|
120
|
+
current_chunk = overlap_chunk.copy()
|
121
|
+
current_token_count = overlap_count
|
122
|
+
else:
|
123
|
+
current_chunk.clear()
|
124
|
+
current_token_count = 0
|
125
|
+
else:
|
126
|
+
# 청크에 추가 후 다음 조각 진행
|
127
|
+
current_chunk.append(cls(text=piece, start_idx=piece_start, end_idx=piece_end))
|
128
|
+
current_token_count += piece_token_count
|
129
|
+
i += 1
|
130
|
+
|
131
|
+
# 남은 조각이 있다면 마지막 청크로 추가합니다.
|
132
|
+
if current_chunk:
|
133
|
+
chunk_start = current_chunk[0].start_idx
|
134
|
+
chunk_end = current_chunk[-1].end_idx
|
135
|
+
chunk_text = full_text[chunk_start:chunk_end]
|
136
|
+
text_chunks.append(cls(text=chunk_text, start_idx=chunk_start, end_idx=chunk_end))
|
137
|
+
|
138
|
+
return text_chunks
|