chatterer 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff compares the contents of two package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- chatterer/__init__.py +39 -27
- chatterer/language_model.py +371 -327
- chatterer/messages.py +8 -0
- chatterer/strategies/__init__.py +13 -13
- chatterer/strategies/atom_of_thoughts.py +975 -976
- chatterer/strategies/base.py +14 -14
- chatterer/tools/__init__.py +17 -15
- chatterer/tools/citation_chunking/__init__.py +3 -0
- chatterer/tools/citation_chunking/chunks.py +53 -0
- chatterer/tools/citation_chunking/citation_chunker.py +118 -0
- chatterer/tools/citation_chunking/citations.py +285 -0
- chatterer/tools/citation_chunking/prompt.py +157 -0
- chatterer/tools/citation_chunking/reference.py +26 -0
- chatterer/tools/citation_chunking/utils.py +138 -0
- chatterer/tools/convert_to_text.py +466 -464
- chatterer/tools/webpage_to_markdown/__init__.py +4 -4
- chatterer/tools/webpage_to_markdown/playwright_bot.py +649 -631
- chatterer/tools/webpage_to_markdown/utils.py +329 -556
- chatterer/utils/image.py +284 -0
- {chatterer-0.1.6.dist-info → chatterer-0.1.7.dist-info}/METADATA +166 -166
- chatterer-0.1.7.dist-info/RECORD +24 -0
- {chatterer-0.1.6.dist-info → chatterer-0.1.7.dist-info}/WHEEL +1 -1
- chatterer-0.1.6.dist-info/RECORD +0 -15
- {chatterer-0.1.6.dist-info → chatterer-0.1.7.dist-info}/top_level.txt +0 -0
chatterer/strategies/base.py
CHANGED
@@ -1,14 +1,14 @@
-from abc import ABC, abstractmethod
-
-from ..language_model import LanguageModelInput
-
-
-class BaseStrategy(ABC):
-    @abstractmethod
-    def invoke(self, messages: LanguageModelInput) -> str:
-        """
-        Invoke the strategy with the given messages.
-
-        messages: List of messages to be passed to the strategy.
-        e.g. [{"role": "user", "content": "What is the meaning of life?"}]
-        """
+from abc import ABC, abstractmethod
+
+from ..language_model import LanguageModelInput
+
+
+class BaseStrategy(ABC):
+    @abstractmethod
+    def invoke(self, messages: LanguageModelInput) -> str:
+        """
+        Invoke the strategy with the given messages.
+
+        messages: List of messages to be passed to the strategy.
+        e.g. [{"role": "user", "content": "What is the meaning of life?"}]
+        """
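The abstract interface above is unchanged apart from formatting. For orientation, here is a minimal sketch of a concrete subclass; `EchoStrategy` is hypothetical and not part of the package, and the import paths are assumed to mirror the file layout:

```python
# Minimal sketch, assuming public import paths mirror the file layout.
from chatterer.language_model import LanguageModelInput
from chatterer.strategies.base import BaseStrategy


class EchoStrategy(BaseStrategy):
    """Hypothetical strategy that just echoes its input back."""

    def invoke(self, messages: LanguageModelInput) -> str:
        # e.g. messages = [{"role": "user", "content": "What is the meaning of life?"}]
        return str(messages)
```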
chatterer/tools/__init__.py
CHANGED
@@ -1,15 +1,17 @@
-from .
-
-
-
-
-
-
-
-
-
-"
-"
-"
-"
-
+from .citation_chunking import citation_chunker
+from .convert_to_text import (
+    anything_to_markdown,
+    get_default_html_to_markdown_options,
+    html_to_markdown,
+    pdf_to_text,
+    pyscripts_to_snippets,
+)
+
+__all__ = [
+    "html_to_markdown",
+    "anything_to_markdown",
+    "pdf_to_text",
+    "get_default_html_to_markdown_options",
+    "pyscripts_to_snippets",
+    "citation_chunker",
+]
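The rewritten `__all__` adds the new citation chunker and `pyscripts_to_snippets` to the public surface. A sketch of the 0.1.7 imports (call signatures are not shown in this diff, so none are assumed here):

```python
# Everything listed in the new __all__ is importable from chatterer.tools.
from chatterer.tools import (
    anything_to_markdown,
    citation_chunker,
    get_default_html_to_markdown_options,
    html_to_markdown,
    pdf_to_text,
    pyscripts_to_snippets,
)
```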
chatterer/tools/citation_chunking/chunks.py
ADDED
@@ -0,0 +1,53 @@
+import logging
+from typing import Callable, Optional, Self
+
+from pydantic import BaseModel, Field
+
+from ...language_model import Chatterer
+from ...messages import AIMessage, BaseMessage, HumanMessage
+from .prompt import (
+    generate_fewshot_affirmative_response,
+    generate_human_assistant_fewshot_examples,
+    generate_instruction,
+)
+from .reference import Reference
+
+logger = logging.getLogger(__name__)
+
+
+class CitationChunk(BaseModel):
+    subject: str = Field(description="The main topic or subject that the citations capture.")
+    references: list[Reference] = Field(description="A list of citation objects and/or regex patterns for the subject.")
+
+
+class CitationChunks(BaseModel):
+    citation_chunks: list[CitationChunk] = Field(
+        description="A list of citation chunks, each capturing a specific topic in the document."
+    )
+
+    @classmethod
+    def from_llm(
+        cls,
+        chatterer: Chatterer,
+        document: str,
+        fewshot_examples_generator: Optional[
+            Callable[[], list[tuple[str, str]]]
+        ] = generate_human_assistant_fewshot_examples,
+        instruction_generator: Optional[Callable[[], str]] = generate_instruction,
+        fewshot_affirmative_response: Optional[Callable[[], str]] = generate_fewshot_affirmative_response,
+    ) -> Self:
+        messages: list[BaseMessage] = []
+        if instruction_generator:
+            messages.append(HumanMessage(content=instruction_generator()))
+        if fewshot_examples_generator is not None:
+            if fewshot_affirmative_response:
+                messages.append(AIMessage(content=generate_fewshot_affirmative_response()))
+            for human_ask, ai_answer in fewshot_examples_generator():
+                messages.append(HumanMessage(content=human_ask))
+                messages.append(AIMessage(content=ai_answer))
+        messages.append(HumanMessage(content=document))
+        try:
+            return chatterer.generate_pydantic(response_model=cls, messages=messages)
+        except Exception as e:
+            logger.error(f"Error obtaining CitationChunks from LLM: {e}")
+            raise e
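`CitationChunks.from_llm` assembles an instruction message, optional few-shot turns, and the document itself before asking the model for a structured response. A minimal usage sketch; how a `Chatterer` instance is constructed is not shown in this diff, so `chatterer` below is assumed to be pre-configured:

```python
from chatterer.tools.citation_chunking.chunks import CitationChunks

# `chatterer` is an assumed, already-configured Chatterer instance.
with open("report.txt") as f:
    chunks = CitationChunks.from_llm(chatterer=chatterer, document=f.read())

for chunk in chunks.citation_chunks:
    print(chunk.subject, len(chunk.references))
```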
chatterer/tools/citation_chunking/citation_chunker.py
ADDED
@@ -0,0 +1,118 @@
+import logging
+from typing import Callable, NamedTuple, Optional, Self
+
+import colorama
+from colorama import Fore
+
+from ...language_model import Chatterer
+from .chunks import CitationChunks
+from .citations import Citations
+from .prompt import (
+    generate_fewshot_affirmative_response,
+    generate_human_assistant_fewshot_examples,
+    generate_instruction,
+)
+
+logger = logging.getLogger(__name__)
+colorama.init()
+
+
+class GlobalCoverage(NamedTuple):
+    coverage: float
+    matched_intervals: list[tuple[int, int]]
+
+    @staticmethod
+    def merge_intervals(intervals: list[tuple[int, int]]) -> list[tuple[int, int]]:
+        if not intervals:
+            return []
+        sorted_intervals = sorted(intervals, key=lambda x: x[0])
+        merged: list[tuple[int, int]] = [sorted_intervals[0]]
+        for current in sorted_intervals[1:]:
+            prev = merged[-1]
+            if current[0] <= prev[1]:
+                merged[-1] = (prev[0], max(prev[1], current[1]))
+            else:
+                merged.append(current)
+        return merged
+
+    @classmethod
+    def from_verified_citations(cls, verified_chunks: list[Citations], document: str) -> Self:
+        all_intervals: list[tuple[int, int]] = []
+        for chunk in verified_chunks:
+            for matches in chunk.references.values():
+                for m in matches:
+                    all_intervals.append((m.start_idx, m.end_idx))
+        merged: list[tuple[int, int]] = cls.merge_intervals(all_intervals)
+        doc_length: int = len(document)
+        total_matched = sum((e - s for s, e in merged))
+        coverage: float = total_matched / doc_length if doc_length > 0 else 0.0
+        return cls(coverage=coverage, matched_intervals=merged)
+
+
+def citation_chunker(
+    document: str,
+    chatterer: Chatterer,
+    global_coverage_threshold: float = 0.9,
+    num_refinement_steps: int = 3,
+    fewshot_examples_generator: Optional[
+        Callable[[], list[tuple[str, str]]]
+    ] = generate_human_assistant_fewshot_examples,
+    instruction_generator: Optional[Callable[[], str]] = generate_instruction,
+    fewshot_affirmative_response: Optional[Callable[[], str]] = generate_fewshot_affirmative_response,
+    test_global_coverage: bool = False,
+) -> list[Citations]:
+    """
+    1) Obtain CitationChunks via the LLM.
+    2) Process each chunk to extract MatchedText using snippet-based index correction.
+    3) Calculate overall document coverage and print results.
+    """
+    unverified_chunks: CitationChunks = CitationChunks.from_llm(
+        chatterer=chatterer,
+        document=document,
+        fewshot_examples_generator=fewshot_examples_generator,
+        instruction_generator=instruction_generator,
+        fewshot_affirmative_response=fewshot_affirmative_response,
+    )
+
+    verified_chunks: list[Citations] = []
+    for chunk in unverified_chunks.citation_chunks:
+        try:
+            vc: Citations = Citations.from_unverified(
+                unverified_chunk=chunk,
+                document=document,
+                model_and_refinement_steps=(chatterer, num_refinement_steps),
+            )
+            verified_chunks.append(vc)
+        except Exception as e:
+            logger.error(f"Error processing chunk for subject '{chunk.subject}': {e}")
+
+    if test_global_coverage:
+        gc = GlobalCoverage.from_verified_citations(verified_chunks, document)
+        logger.info(f"Global coverage: {gc.coverage * 100:.1f}%")
+        if gc.coverage < global_coverage_threshold:
+            logger.info(
+                f"Global coverage {gc.coverage * 100:.1f}% is below the threshold {global_coverage_threshold * 100:.1f}%."
+            )
+        print("=== Final Global Coverage Check ===")
+        print(f"Overall coverage: {gc.coverage * 100:.1f}% of the document.")
+        if gc.matched_intervals:
+            print("Merged matched intervals:")
+            for interval in gc.matched_intervals:
+                print(f" - {interval}")
+        else:
+            print("No matches found across all chunks.")
+        print("\n=== Raw Semantic Chunking Result ===")
+        for vc in verified_chunks:
+            print(f"{Fore.LIGHTGREEN_EX}[SUBJECT] {Fore.GREEN}{vc.name}{Fore.RESET}")
+            if vc.references:
+                for source_key, matches in vc.references.items():
+                    print(f"{Fore.LIGHTBLUE_EX} [SOURCE] {Fore.BLUE}{source_key}{Fore.RESET}")
+                    for mt in matches:
+                        snippet = repr(mt.text)
+                        print(
+                            f" {Fore.LIGHTYELLOW_EX}[MATCH @ {mt.start_idx}~{mt.end_idx}] {Fore.YELLOW}{snippet}{Fore.RESET}"
+                        )
+            else:
+                print(" - (No matches found even after refinement.)")
+
+    return verified_chunks
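`GlobalCoverage.merge_intervals` is ordinary interval merging, so it can be exercised without a model; the end-to-end call below again assumes a pre-configured `Chatterer` instance (its construction is not part of this diff):

```python
from chatterer.tools.citation_chunking.citation_chunker import GlobalCoverage, citation_chunker

# Overlapping spans are collapsed before coverage is measured:
assert GlobalCoverage.merge_intervals([(0, 5), (3, 10), (12, 15)]) == [(0, 10), (12, 15)]

# End-to-end run; `chatterer` is an assumed, pre-configured Chatterer instance.
document = open("report.txt").read()
verified = citation_chunker(document=document, chatterer=chatterer, test_global_coverage=True)
for citations in verified:
    print(citations.name, list(citations.references))
```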
chatterer/tools/citation_chunking/citations.py
ADDED
@@ -0,0 +1,285 @@
+from __future__ import annotations
+
+import difflib
+import logging
+from typing import NamedTuple, Optional, Self, TypeAlias
+
+from pydantic import Field
+from regex import DOTALL
+from regex import compile as regex_compile
+from regex import error as regex_error
+
+from ...language_model import Chatterer
+from ...messages import HumanMessage
+from .chunks import CitationChunk
+from .reference import MultiMatchRegex, Reference, SingleMatchCitation
+from .utils import MatchedText
+
+ModelAndSteps: TypeAlias = tuple[Chatterer, int]
+logger = logging.getLogger(__name__)
+
+
+class Citations(NamedTuple):
+    """
+    Holds the verified citation chunks and their matching information.
+    """
+
+    name: str
+    references: dict[Reference, list[ReferencedTextMatch]]
+
+    @classmethod
+    def from_unverified(
+        cls,
+        unverified_chunk: CitationChunk,
+        document: str,
+        model_and_refinement_steps: Optional[ModelAndSteps] = None,  # Optional LLM for refinement
+    ) -> Self:
+        subject: str = unverified_chunk.subject
+        self: Self = cls(name=subject, references={})
+        for reference in unverified_chunk.references or ():
+            if isinstance(reference, SingleMatchCitation):
+                try:
+                    mt: Optional[ReferencedTextMatch] = ReferencedTextMatch.from_citation(
+                        subject=subject,
+                        citation=reference,
+                        document=document,
+                        model_and_refinement_steps=model_and_refinement_steps,
+                    )
+                    if mt is None or not mt.text.strip():
+                        logger.warning(f"Failed to extract text for citation {reference} in subject '{subject}'.")
+                    else:
+                        self.references[reference] = [mt]
+                except Exception as e:
+                    logger.error(f"Error processing citation {reference} for subject '{subject}': {e}")
+            else:
+                try:
+                    regex_matches: list[ReferencedTextMatch] = ReferencedTextMatch.from_regex(
+                        regex=reference, subject=subject, document=document
+                    )
+                    if regex_matches:
+                        self.references[reference] = regex_matches
+                except regex_error as e:
+                    logger.error(f"Regex error for subject '{subject}' with pattern '{reference}': {e}")
+        return self
+
+
+class ReferencedTextMatch(MatchedText):
+    @classmethod
+    def from_citation(
+        cls,
+        subject: str,
+        citation: SingleMatchCitation,
+        document: str,
+        model_and_refinement_steps: Optional[ModelAndSteps] = None,  # Optional LLM for quality-check refinement
+    ) -> Optional[Self]:
+        """
+        Extract text from the document using the adjusted citation indices.
+        Additionally, if a language model is provided, evaluate the extraction quality
+        and refine it if needed.
+        """
+        citation_id: Optional[SingleMatchCitationWithIndex] = SingleMatchCitationWithIndex.from_indexless_citation(
+            indexless_citation=citation,
+            document=document,
+            subject=subject,
+            model_and_refinement_steps=model_and_refinement_steps,
+        )
+        if citation_id is None:
+            return
+
+        return cls(
+            start_idx=citation_id.start,
+            end_idx=citation_id.end,
+            text=citation_id.extracted_text,
+        )
+
+    @classmethod
+    def from_regex(cls, regex: MultiMatchRegex, subject: str, document: str) -> list[Self]:
+        """
+        Apply the given regex to the document and return all matching results as a list of MatchedText.
+        """
+        try:
+            compiled_pattern = regex_compile(regex.regular_expression, flags=DOTALL)
+        except regex_error as e:
+            logger.error(f"Regex compilation error for pattern /{regex.regular_expression}/: {e}")
+            raise e
+        try:
+            matches = list(compiled_pattern.finditer(document, timeout=1.0))
+        except regex_error as e:
+            logger.error(f"Regex matching error for pattern /{regex.regular_expression}/: {e}")
+            raise e
+        return [cls(start_idx=m.start(), end_idx=m.end(), text=m.group()) for m in matches]
+
+
+class SingleMatchCitationWithIndex(SingleMatchCitation):
+    start: int = Field(description="The computed start index of the citation in the document.")
+    end: int = Field(description="The computed end index of the citation in the document.")
+    extracted_text: str = Field(description="The extracted text from the document using the computed indices.")
+
+    @classmethod
+    def from_indexless_citation(
+        cls,
+        indexless_citation: SingleMatchCitation,
+        document: str,
+        subject: str,
+        model_and_refinement_steps: Optional[ModelAndSteps] = None,  # Optional LLM for quality-check refinement
+    ) -> Optional[Self]:
+        """
+        Compute the correct start and end indices for the citation based on the provided text snippets.
+        This method ignores any indices provided by the LLM and computes them using a similarity-based search.
+        If multiple high-scoring candidates are found, the one with the highest effective score is chosen.
+        """
+        if model_and_refinement_steps is None:
+            model = None
+            num_refinement_steps = 1
+        else:
+            model, num_refinement_steps = model_and_refinement_steps
+        for _ in range(num_refinement_steps):
+            result = cls.from_indexless_citation_with_refinement(
+                indexless_citation=indexless_citation,
+                document=document,
+                subject=subject,
+                chatterer=model,
+            )
+            if result is None:
+                continue
+            return result
+
+    @staticmethod
+    def find_best_match_index(snippet: str, document: str, target_index: int) -> Optional[int]:
+        """
+        Extracts a candidate window centered around the specified target_index,
+        with a size equal to the length of the snippet. Within this region,
+        it calculates the similarity with the snippet using a sliding window approach.
+
+        The index of the candidate with the highest effective_score is returned.
+        If no suitable candidate is found, the target_index is returned.
+
+        Note: If multiple high-scoring candidates are found, the one with the highest effective score is chosen.
+        """
+        snippet = snippet.strip()
+        if not snippet:
+            return
+        snippet_len: int = len(snippet)
+        best_index: int = -1
+        best_effective_score = 0.0
+        max_radius = max(target_index, len(document) - target_index)
+        for offset in range(max_radius):
+            for candidate_index in (
+                target_index - offset,
+                target_index + offset,
+            ):
+                if candidate_index < 0 or candidate_index + snippet_len > len(document):
+                    continue
+                candidate_segment = document[candidate_index : min(candidate_index + snippet_len, len(document))]
+                if len(candidate_segment) < snippet_len:
+                    continue
+                local_best_similarity = 0.0
+                local_best_offset = 0
+                for i in range(0, len(candidate_segment) - snippet_len + 1):
+                    candidate_window = candidate_segment[i : i + snippet_len]
+                    similarity = difflib.SequenceMatcher(None, snippet, candidate_window).ratio()
+                    if similarity > local_best_similarity:
+                        local_best_similarity = similarity
+                        local_best_offset = i
+                candidate_final_index = candidate_index + local_best_offset
+                if candidate_final_index + snippet_len > len(document):
+                    candidate_final_index = len(document) - snippet_len
+                if local_best_similarity > best_effective_score:
+                    best_effective_score = local_best_similarity
+                    best_index = candidate_final_index
+        if not 0 <= best_index < len(document):
+            logger.warning(f"Snippet '{snippet}' not found with sufficient similarity.")
+            return
+        else:
+            logger.debug(
+                f"Found best match for snippet '{snippet}' at index {best_index} with effective score {best_effective_score:.2f}."
+            )
+            return best_index
+
+    @classmethod
+    def from_indexless_citation_with_refinement(
+        cls,
+        indexless_citation: SingleMatchCitation,
+        document: str,
+        subject: str,
+        chatterer: Optional[Chatterer],
+    ) -> Optional[Self]:
+        if chatterer is None:
+            logger.error("No LLM provided for indexless citation refinement.")
+            new_indexless_citation = indexless_citation
+        else:
+            new_indexless_citation = chatterer.generate_pydantic(
+                response_model=SingleMatchCitation,
+                messages=[
+                    HumanMessage(
+                        content=(
+                            "I tried to find the `SNIPPET` in the `original-raw-document` to extract a text citation for the subject `subject-to-parse`, but I couldn't find it. "
+                            "Please provide `citation-start-from` and `citation-end-at` to help me locate the correct text span.\n"
+                            "---\n"
+                            "<original-raw-document>\n"
+                            f"{document}\n"
+                            "</original-raw-document>\n"
+                            "---\n"
+                            "<subject-to-parse>\n"
+                            f"{subject}\n"
+                            "</subject-to-parse>\n"
+                            "---\n"
+                            "<current-citation-start-from>\n"
+                            f"{indexless_citation.start_from}\n"
+                            "</current-citation-start-from>\n"
+                            "---\n"
+                            "<current-citation-end-at>\n"
+                            f"{indexless_citation.end_at}\n"
+                            "</current-citation-end-at>\n"
+                        )
+                    ),
+                ],
+            )
+        doc_len: int = len(document)
+
+        start_snippet: str = new_indexless_citation.start_from.strip()
+        if start_snippet:
+            target_for_start = document.find(start_snippet)
+            if target_for_start == -1:
+                target_for_start = 0
+            new_start: Optional[int] = cls.find_best_match_index(
+                snippet=start_snippet,
+                document=document,
+                target_index=target_for_start,
+            )
+            if new_start is None:
+                return
+        else:
+            logger.warning("No start_text provided")
+            return
+        end_snippet: str = new_indexless_citation.end_at.strip()
+        if end_snippet:
+            target_for_end = document.find(end_snippet, new_start)
+            if target_for_end == -1:
+                target_for_end = new_start
+            candidate_end: Optional[int] = cls.find_best_match_index(
+                snippet=end_snippet,
+                document=document,
+                target_index=target_for_end,
+            )
+            if candidate_end is None:
+                return
+            new_end: int = candidate_end + len(end_snippet)
+        else:
+            logger.warning("No end_text provided; defaulting end index to document length.")
+            new_end = doc_len
+        if not 0 <= new_start < new_end <= doc_len:
+            logger.error(f"Adjusted citation indices invalid: start {new_start}, end {new_end}, doc_len {doc_len}.")
+            return
+        try:
+            extracted_text = document[new_start:new_end]
+        except IndexError as e:
+            logger.error(f"Error extracting text using adjusted citation indices: {e}")
+            return
+        return cls(
+            start=new_start,
+            end=new_end,
+            start_from=new_indexless_citation.start_from,
+            end_at=new_indexless_citation.end_at,
+            extracted_text=extracted_text,
+        )
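The key idea in `find_best_match_index` is to trust the LLM's text snippets but never its offsets: the snippet is located by sliding a window over the document and scoring each position with `difflib.SequenceMatcher`. A standalone sketch of that search, simplified to a single full-document pass rather than the radius-ordered scan above:

```python
import difflib

document = "In a landmark development, researchers unveiled a quantum computer."
snippet = "In a landmrk development"  # imperfect snippet, as an LLM might emit

# Score every snippet-sized window and keep the best-matching start index.
best = max(
    range(len(document) - len(snippet) + 1),
    key=lambda i: difflib.SequenceMatcher(None, snippet, document[i : i + len(snippet)]).ratio(),
)
print(best, repr(document[best : best + len(snippet)]))  # -> 0 'In a landmark developmen'
```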
chatterer/tools/citation_chunking/prompt.py
ADDED
@@ -0,0 +1,157 @@
+"""
+ragent/prompt/citation_chunking.py
+
+This module defines prompt constants for citation chunking.
+The LLM is expected to return JSON objects that include only the text snippets for the beginning and end of the citation span.
+The character indices will be computed in a post‐processing step.
+"""
+
+from functools import cache
+
+
+@cache
+def generate_instruction() -> str:
+    from .chunks import CitationChunk, CitationChunks
+    from .reference import (
+        MultiMatchRegex,
+        SingleMatchCitation,
+    )
+
+    return (
+        "You are an AI specialized in 'citation-based text chunking'.\n"
+        "Given a document, perform the following steps:\n"
+        "1) Identify the major topics in the document.\n"
+        "2) For each topic, provide a list of citation objects indicating the text snippets at the beginning and end of the relevant paragraph(s) for that topic.\n\n"
+        "Important:\n"
+        "- Return citation objects with 'start_text' and 'end_text' fields to precisely capture the text span. Do NOT include character indices.\n"
+        "- If a regular expression based matching is more appropriate for a topic (e.g. for multiple matches), you may include a regex object of type 'multi_match_regex'.\n\n"
+        "Return JSON strictly in the following format:\n"
+        "{json_example}\n\n"
+        "1) Return only valid JSON (no extra keys).\n"
+        "2) Do NOT include any commentary.\n"
+        "3) Ensure that the citations capture the entire relevant paragraph without overlap or omission."
+    ).format(
+        json_example=CitationChunks(
+            citation_chunks=[
+                CitationChunk(
+                    subject="Quantum Advantage",
+                    references=[
+                        SingleMatchCitation(
+                            start_from="Starting snippet...",
+                            end_at="... Ending snippet",
+                        ),
+                        MultiMatchRegex(
+                            type="multi_match_regex",
+                            regular_expression="Some.*?regex.*?pattern",
+                        ),
+                    ],
+                ),
+            ]
+        ).model_dump_json(indent=2)
+    )
+
+
+@cache
+def generate_human_assistant_fewshot_examples() -> list[tuple[str, str]]:
+    from .chunks import CitationChunk, CitationChunks
+    from .reference import SingleMatchCitation
+
+    return [
+        (
+            "Agent-Semantic Chunking of the following text:\n\n"
+            "Title: Revolutionary Breakthrough in Quantum Computing\n\n"
+            "In a landmark development, researchers at the National Quantum Laboratory unveiled a quantum computer "
+            "that demonstrates clear quantum advantage by performing computations that are infeasible on classical systems.\n\n"
+            "The breakthrough is the result of years of rigorous research and international collaboration. "
+            "The system leverages entanglement and superposition to process complex algorithms at unprecedented speeds.\n\n"
+            "However, practical applications are still emerging, and experts caution about scalability challenges. "
+            "Meanwhile, several tech giants are expressing keen interest in integrating quantum technology into future products.\n\n"
+            "Please classify the major topics and return the exact text snippets (for the start and end of the relevant paragraphs) for each topic.",
+            CitationChunks(
+                citation_chunks=[
+                    CitationChunk(
+                        subject="Quantum Advantage",
+                        references=[
+                            SingleMatchCitation(
+                                start_from="In a landmark development",
+                                end_at="on classical systems.",
+                            ),
+                        ],
+                    ),
+                    CitationChunk(
+                        subject="Research Collaboration",
+                        references=[
+                            SingleMatchCitation(
+                                start_from="The breakthrough is the result",
+                                end_at="unprecedented speeds.",
+                            ),
+                        ],
+                    ),
+                    CitationChunk(
+                        subject="Practical Challenges",
+                        references=[
+                            SingleMatchCitation(
+                                start_from="However, practical applications",
+                                end_at="scalability challenges.",
+                            ),
+                        ],
+                    ),
+                    CitationChunk(
+                        subject="Industry Interest",
+                        references=[
+                            SingleMatchCitation(
+                                start_from="Meanwhile, several tech giants",
+                                end_at="future products.",
+                            ),
+                        ],
+                    ),
+                ]
+            ).model_dump_json(indent=2),
+        ),
+        (
+            "Agent-Semantic Chunking of the following text:\n\n"
+            "Title: Rising Seas and Coastal Erosion: A Global Crisis\n\n"
+            "Communities worldwide face the impacts of climate change as rising sea levels lead to accelerated coastal erosion, "
+            "jeopardizing homes and critical infrastructure.\n\n"
+            'In a small coastal town, residents noted that "the encroaching sea" has already begun to claim beachfront properties, '
+            "prompting local authorities to implement emergency measures.\n\n"
+            "Environmental experts warn that without significant intervention, the frequency and severity of these events will increase, "
+            "further exacerbating the global climate crisis.\n\n"
+            "Please classify the major topics and return the exact text snippets (for the start and end of the relevant paragraphs) for each topic.",
+            CitationChunks(
+                citation_chunks=[
+                    CitationChunk(
+                        subject="Coastal Erosion Impact",
+                        references=[
+                            SingleMatchCitation(
+                                start_from="Communities worldwide face the impacts",
+                                end_at="critical infrastructure.",
+                            ),
+                        ],
+                    ),
+                    CitationChunk(
+                        subject="Local Emergency Response",
+                        references=[
+                            SingleMatchCitation(
+                                start_from="In a small coastal town",
+                                end_at="emergency measures.",
+                            ),
+                        ],
+                    ),
+                    CitationChunk(
+                        subject="Expert Warning",
+                        references=[
+                            SingleMatchCitation(
+                                start_from="Environmental experts warn",
+                                end_at="global climate crisis.",
+                            ),
+                        ],
+                    ),
+                ]
+            ).model_dump_json(indent=2),
+        ),
+    ]
+
+
+def generate_fewshot_affirmative_response() -> str:
+    return "Great! I will now perform the citation-based chunking. Please provide the document to process!"