chatterer 0.1.18__py3-none-any.whl → 0.1.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +93 -93
- chatterer/common_types/__init__.py +21 -21
- chatterer/common_types/io.py +19 -19
- chatterer/examples/__init__.py +0 -0
- chatterer/examples/anything_to_markdown.py +85 -91
- chatterer/examples/get_code_snippets.py +55 -62
- chatterer/examples/login_with_playwright.py +156 -167
- chatterer/examples/make_ppt.py +488 -497
- chatterer/examples/pdf_to_markdown.py +100 -107
- chatterer/examples/pdf_to_text.py +54 -56
- chatterer/examples/transcription_api.py +112 -123
- chatterer/examples/upstage_parser.py +89 -100
- chatterer/examples/webpage_to_markdown.py +70 -79
- chatterer/interactive.py +354 -354
- chatterer/language_model.py +533 -533
- chatterer/messages.py +21 -21
- chatterer/strategies/__init__.py +13 -13
- chatterer/strategies/atom_of_thoughts.py +975 -975
- chatterer/strategies/base.py +14 -14
- chatterer/tools/__init__.py +46 -46
- chatterer/tools/caption_markdown_images.py +384 -384
- chatterer/tools/citation_chunking/__init__.py +3 -3
- chatterer/tools/citation_chunking/chunks.py +53 -53
- chatterer/tools/citation_chunking/citation_chunker.py +118 -118
- chatterer/tools/citation_chunking/citations.py +285 -285
- chatterer/tools/citation_chunking/prompt.py +157 -157
- chatterer/tools/citation_chunking/reference.py +26 -26
- chatterer/tools/citation_chunking/utils.py +138 -138
- chatterer/tools/convert_pdf_to_markdown.py +393 -302
- chatterer/tools/convert_to_text.py +446 -447
- chatterer/tools/upstage_document_parser.py +705 -705
- chatterer/tools/webpage_to_markdown.py +739 -739
- chatterer/tools/youtube.py +146 -146
- chatterer/utils/__init__.py +15 -15
- chatterer/utils/base64_image.py +285 -285
- chatterer/utils/bytesio.py +59 -59
- chatterer/utils/code_agent.py +237 -237
- chatterer/utils/imghdr.py +148 -148
- {chatterer-0.1.18.dist-info → chatterer-0.1.20.dist-info}/METADATA +392 -392
- chatterer-0.1.20.dist-info/RECORD +44 -0
- {chatterer-0.1.18.dist-info → chatterer-0.1.20.dist-info}/WHEEL +1 -1
- chatterer-0.1.20.dist-info/entry_points.txt +10 -0
- chatterer-0.1.18.dist-info/RECORD +0 -42
- {chatterer-0.1.18.dist-info → chatterer-0.1.20.dist-info}/top_level.txt +0 -0
@@ -1,285 +1,285 @@

The rendered hunk corresponds to chatterer/tools/citation_chunking/citations.py (+285 -285): every line of the module is removed and re-added, and the removed and re-added sides are identical as rendered here, so the file content is shown once below.

```python
from __future__ import annotations

import difflib
import logging
from typing import NamedTuple, Optional, Self, TypeAlias

from pydantic import Field
from regex import DOTALL
from regex import compile as regex_compile
from regex import error as regex_error

from ...language_model import Chatterer
from ...messages import HumanMessage
from .chunks import CitationChunk
from .reference import MultiMatchRegex, Reference, SingleMatchCitation
from .utils import MatchedText

ModelAndSteps: TypeAlias = tuple[Chatterer, int]
logger = logging.getLogger(__name__)


class Citations(NamedTuple):
    """
    Holds the verified citation chunks and their matching information.
    """

    name: str
    references: dict[Reference, list[ReferencedTextMatch]]

    @classmethod
    def from_unverified(
        cls,
        unverified_chunk: CitationChunk,
        document: str,
        model_and_refinement_steps: Optional[ModelAndSteps] = None,  # Optional LLM for refinement
    ) -> Self:
        subject: str = unverified_chunk.subject
        self: Self = cls(name=subject, references={})
        for reference in unverified_chunk.references or ():
            if isinstance(reference, SingleMatchCitation):
                try:
                    mt: Optional[ReferencedTextMatch] = ReferencedTextMatch.from_citation(
                        subject=subject,
                        citation=reference,
                        document=document,
                        model_and_refinement_steps=model_and_refinement_steps,
                    )
                    if mt is None or not mt.text.strip():
                        logger.warning(f"Failed to extract text for citation {reference} in subject '{subject}'.")
                    else:
                        self.references[reference] = [mt]
                except Exception as e:
                    logger.error(f"Error processing citation {reference} for subject '{subject}': {e}")
            else:
                try:
                    regex_matches: list[ReferencedTextMatch] = ReferencedTextMatch.from_regex(
                        regex=reference, subject=subject, document=document
                    )
                    if regex_matches:
                        self.references[reference] = regex_matches
                except regex_error as e:
                    logger.error(f"Regex error for subject '{subject}' with pattern '{reference}': {e}")
        return self


class ReferencedTextMatch(MatchedText):
    @classmethod
    def from_citation(
        cls,
        subject: str,
        citation: SingleMatchCitation,
        document: str,
        model_and_refinement_steps: Optional[ModelAndSteps] = None,  # Optional LLM for quality-check refinement
    ) -> Optional[Self]:
        """
        Extract text from the document using the adjusted citation indices.
        Additionally, if a language model is provided, evaluate the extraction quality
        and refine it if needed.
        """
        citation_id: Optional[SingleMatchCitationWithIndex] = SingleMatchCitationWithIndex.from_indexless_citation(
            indexless_citation=citation,
            document=document,
            subject=subject,
            model_and_refinement_steps=model_and_refinement_steps,
        )
        if citation_id is None:
            return

        return cls(
            start_idx=citation_id.start,
            end_idx=citation_id.end,
            text=citation_id.extracted_text,
        )

    @classmethod
    def from_regex(cls, regex: MultiMatchRegex, subject: str, document: str) -> list[Self]:
        """
        Apply the given regex to the document and return all matching results as a list of MatchedText.
        """
        try:
            compiled_pattern = regex_compile(regex.regular_expression, flags=DOTALL)
        except regex_error as e:
            logger.error(f"Regex compilation error for pattern /{regex.regular_expression}/: {e}")
            raise e
        try:
            matches = list(compiled_pattern.finditer(document, timeout=1.0))
        except regex_error as e:
            logger.error(f"Regex matching error for pattern /{regex.regular_expression}/: {e}")
            raise e
        return [cls(start_idx=m.start(), end_idx=m.end(), text=m.group()) for m in matches]


class SingleMatchCitationWithIndex(SingleMatchCitation):
    start: int = Field(description="The computed start index of the citation in the document.")
    end: int = Field(description="The computed end index of the citation in the document.")
    extracted_text: str = Field(description="The extracted text from the document using the computed indices.")

    @classmethod
    def from_indexless_citation(
        cls,
        indexless_citation: SingleMatchCitation,
        document: str,
        subject: str,
        model_and_refinement_steps: Optional[ModelAndSteps] = None,  # Optional LLM for quality-check refinement
    ) -> Optional[Self]:
        """
        Compute the correct start and end indices for the citation based on the provided text snippets.
        This method ignores any indices provided by the LLM and computes them using a similarity-based search.
        If multiple high-scoring candidates are found, the one with the highest effective score is chosen.
        """
        if model_and_refinement_steps is None:
            model = None
            num_refinement_steps = 1
        else:
            model, num_refinement_steps = model_and_refinement_steps
        for _ in range(num_refinement_steps):
            result = cls.from_indexless_citation_with_refinement(
                indexless_citation=indexless_citation,
                document=document,
                subject=subject,
                chatterer=model,
            )
            if result is None:
                continue
            return result

    @staticmethod
    def find_best_match_index(snippet: str, document: str, target_index: int) -> Optional[int]:
        """
        Extracts a candidate window centered around the specified target_index,
        with a size equal to the length of the snippet. Within this region,
        it calculates the similarity with the snippet using a sliding window approach.

        The index of the candidate with the highest effective_score is returned.
        If no suitable candidate is found, the target_index is returned.

        Note: If multiple high-scoring candidates are found, the one with the highest effective score is chosen.
        """
        snippet = snippet.strip()
        if not snippet:
            return
        snippet_len: int = len(snippet)
        best_index: int = -1
        best_effective_score = 0.0
        max_radius = max(target_index, len(document) - target_index)
        for offset in range(max_radius):
            for candidate_index in (
                target_index - offset,
                target_index + offset,
            ):
                if candidate_index < 0 or candidate_index + snippet_len > len(document):
                    continue
                candidate_segment = document[candidate_index : min(candidate_index + snippet_len, len(document))]
                if len(candidate_segment) < snippet_len:
                    continue
                local_best_similarity = 0.0
                local_best_offset = 0
                for i in range(0, len(candidate_segment) - snippet_len + 1):
                    candidate_window = candidate_segment[i : i + snippet_len]
                    similarity = difflib.SequenceMatcher(None, snippet, candidate_window).ratio()
                    if similarity > local_best_similarity:
                        local_best_similarity = similarity
                        local_best_offset = i
                candidate_final_index = candidate_index + local_best_offset
                if candidate_final_index + snippet_len > len(document):
                    candidate_final_index = len(document) - snippet_len
                if local_best_similarity > best_effective_score:
                    best_effective_score = local_best_similarity
                    best_index = candidate_final_index
        if not 0 <= best_index < len(document):
            logger.warning(f"Snippet '{snippet}' not found with sufficient similarity.")
            return
        else:
            logger.debug(
                f"Found best match for snippet '{snippet}' at index {best_index} with effective score {best_effective_score:.2f}."
            )
            return best_index

    @classmethod
    def from_indexless_citation_with_refinement(
        cls,
        indexless_citation: SingleMatchCitation,
        document: str,
        subject: str,
        chatterer: Optional[Chatterer],
    ) -> Optional[Self]:
        if chatterer is None:
            logger.error("No LLM provided for indexless citation refinement.")
            new_indexless_citation = indexless_citation
        else:
            new_indexless_citation = chatterer.generate_pydantic(
                response_model=SingleMatchCitation,
                messages=[
                    HumanMessage(
                        content=(
                            "I tried to find the `SNIPPET` in the `original-raw-document` to extract a text citation for the subject `subject-to-parse`, but I couldn't find it. "
                            "Please provide `citation-start-from` and `citation-end-at` to help me locate the correct text span.\n"
                            "---\n"
                            "<original-raw-document>\n"
                            f"{document}\n"
                            "</original-raw-document>\n"
                            "---\n"
                            "<subject-to-parse>\n"
                            f"{subject}\n"
                            "</subject-to-parse>\n"
                            "---\n"
                            "<current-citation-start-from>\n"
                            f"{indexless_citation.start_from}\n"
                            "</current-citation-start-from>\n"
                            "---\n"
                            "<current-citation-end-at>\n"
                            f"{indexless_citation.end_at}\n"
                            "</current-citation-end-at>\n"
                        )
                    ),
                ],
            )
        doc_len: int = len(document)

        start_snippet: str = new_indexless_citation.start_from.strip()
        if start_snippet:
            target_for_start = document.find(start_snippet)
            if target_for_start == -1:
                target_for_start = 0
            new_start: Optional[int] = cls.find_best_match_index(
                snippet=start_snippet,
                document=document,
                target_index=target_for_start,
            )
            if new_start is None:
                return
        else:
            logger.warning("No start_text provided")
            return
        end_snippet: str = new_indexless_citation.end_at.strip()
        if end_snippet:
            target_for_end = document.find(end_snippet, new_start)
            if target_for_end == -1:
                target_for_end = new_start
            candidate_end: Optional[int] = cls.find_best_match_index(
                snippet=end_snippet,
                document=document,
                target_index=target_for_end,
            )
            if candidate_end is None:
                return
            new_end: int = candidate_end + len(end_snippet)
        else:
            logger.warning("No end_text provided; defaulting end index to document length.")
            new_end = doc_len
        if not 0 <= new_start < new_end <= doc_len:
            logger.error(f"Adjusted citation indices invalid: start {new_start}, end {new_end}, doc_len {doc_len}.")
            return
        try:
            extracted_text = document[new_start:new_end]
        except IndexError as e:
            logger.error(f"Error extracting text using adjusted citation indices: {e}")
            return
        return cls(
            start=new_start,
            end=new_end,
            start_from=new_indexless_citation.start_from,
            end_at=new_indexless_citation.end_at,
            extracted_text=extracted_text,
        )
```