chatterer 0.1.6__tar.gz → 0.1.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {chatterer-0.1.6 → chatterer-0.1.8}/PKG-INFO +2 -2
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/__init__.py +12 -0
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/language_model.py +78 -34
- chatterer-0.1.8/chatterer/messages.py +8 -0
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/strategies/atom_of_thoughts.py +1 -2
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/tools/__init__.py +2 -0
- chatterer-0.1.8/chatterer/tools/citation_chunking/__init__.py +3 -0
- chatterer-0.1.8/chatterer/tools/citation_chunking/chunks.py +53 -0
- chatterer-0.1.8/chatterer/tools/citation_chunking/citation_chunker.py +118 -0
- chatterer-0.1.8/chatterer/tools/citation_chunking/citations.py +285 -0
- chatterer-0.1.8/chatterer/tools/citation_chunking/prompt.py +157 -0
- chatterer-0.1.8/chatterer/tools/citation_chunking/reference.py +26 -0
- chatterer-0.1.8/chatterer/tools/citation_chunking/utils.py +138 -0
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/tools/convert_to_text.py +40 -41
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/tools/webpage_to_markdown/playwright_bot.py +28 -10
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/tools/webpage_to_markdown/utils.py +11 -238
- chatterer-0.1.8/chatterer/utils/image.py +288 -0
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer.egg-info/PKG-INFO +2 -2
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer.egg-info/SOURCES.txt +10 -1
- {chatterer-0.1.6 → chatterer-0.1.8}/pyproject.toml +30 -30
- {chatterer-0.1.6 → chatterer-0.1.8}/README.md +0 -0
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/py.typed +0 -0
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/strategies/__init__.py +0 -0
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/strategies/base.py +0 -0
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/tools/webpage_to_markdown/__init__.py +0 -0
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer.egg-info/dependency_links.txt +0 -0
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer.egg-info/requires.txt +0 -0
- {chatterer-0.1.6 → chatterer-0.1.8}/chatterer.egg-info/top_level.txt +0 -0
- {chatterer-0.1.6 → chatterer-0.1.8}/setup.cfg +0 -0
{chatterer-0.1.6 → chatterer-0.1.8}/chatterer/__init__.py

@@ -1,4 +1,10 @@
 from .language_model import Chatterer
+from .messages import (
+    AIMessage,
+    BaseMessage,
+    HumanMessage,
+    SystemMessage,
+)
 from .strategies import (
     AoTPipeline,
     AoTPrompter,
@@ -7,6 +13,7 @@ from .strategies import (
 )
 from .tools import (
     anything_to_markdown,
+    citation_chunker,
     get_default_html_to_markdown_options,
     html_to_markdown,
     pdf_to_text,
@@ -24,4 +31,9 @@ __all__ = [
     "pdf_to_text",
     "get_default_html_to_markdown_options",
     "pyscripts_to_snippets",
+    "citation_chunker",
+    "BaseMessage",
+    "HumanMessage",
+    "SystemMessage",
+    "AIMessage",
 ]
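With 0.1.8, the LangChain message classes and the new citation_chunker are re-exported from the package root. A minimal sketch of what that enables, assuming an OpenAI-backed client; the argument-free Chatterer.openai() call and its default model are assumptions, not spelled out in this diff:

    from chatterer import Chatterer, HumanMessage, SystemMessage

    chatterer = Chatterer.openai()  # assumed: classmethod constructor with usable defaults
    reply: str = chatterer([
        SystemMessage(content="You are a terse assistant."),
        HumanMessage(content="Summarize this release in one sentence."),
    ])
    print(reply)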
{chatterer-0.1.6 → chatterer-0.1.8}/chatterer/language_model.py

@@ -8,15 +8,18 @@ from typing import (
     Type,
     TypeAlias,
     TypeVar,
+    cast,
+    overload,
 )
 
 from langchain_core.language_models.base import LanguageModelInput
 from langchain_core.language_models.chat_models import BaseChatModel
-from langchain_core.messages import HumanMessage
 from langchain_core.runnables.base import Runnable
 from langchain_core.runnables.config import RunnableConfig
 from pydantic import BaseModel, Field
 
+from .messages import AIMessage, BaseMessage, HumanMessage
+
 if TYPE_CHECKING:
     from instructor import Partial
 
@@ -32,19 +35,37 @@ class Chatterer(BaseModel):
     client: BaseChatModel
     structured_output_kwargs: dict[str, Any] = Field(default_factory=dict)
 
-    [13 lines removed; content not captured in this rendering]
+    @overload
+    def __call__(
+        self,
+        messages: LanguageModelInput,
+        response_model: Type[PydanticModelT],
+        config: Optional[RunnableConfig] = None,
+        stop: Optional[list[str]] = None,
+        **kwargs: Any,
+    ) -> PydanticModelT: ...
+
+    @overload
+    def __call__(
+        self,
+        messages: LanguageModelInput,
+        response_model: None = None,
+        config: Optional[RunnableConfig] = None,
+        stop: Optional[list[str]] = None,
+        **kwargs: Any,
+    ) -> str: ...
+
+    def __call__(
+        self,
+        messages: LanguageModelInput,
+        response_model: Optional[Type[PydanticModelT]] = None,
+        config: Optional[RunnableConfig] = None,
+        stop: Optional[list[str]] = None,
+        **kwargs: Any,
+    ) -> str | PydanticModelT:
+        if response_model:
+            return self.generate_pydantic(response_model, messages, config, stop, **kwargs)
+        return self.client.invoke(input=messages, config=config, stop=stop, **kwargs).text()
 
     @classmethod
     def openai(
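The new overloads make the return type of __call__ track the response_model argument: pass a Pydantic model class and a validated instance comes back; pass nothing and you get plain text. A sketch of that contract, assuming a Chatterer instance named chatterer (the Propositions model is borrowed from the test block later in this diff):

    from pydantic import BaseModel

    class Propositions(BaseModel):
        propositions: list[str]

    text: str = chatterer("List three facts about the Moon.")  # plain-text path
    parsed: Propositions = chatterer("List three facts about the Moon.", Propositions)  # structured path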
@@ -233,6 +254,40 @@ class Chatterer(BaseModel):
         )
     ])
 
+    @staticmethod
+    def get_num_tokens_from_message(message: BaseMessage) -> Optional[tuple[int, int]]:
+        try:
+            if isinstance(message, AIMessage) and (usage_metadata := message.usage_metadata):
+                input_tokens = int(usage_metadata["input_tokens"])
+                output_tokens = int(usage_metadata["output_tokens"])
+            else:
+                # Dynamic extraction for unknown structures
+                input_tokens: Optional[int] = None
+                output_tokens: Optional[int] = None
+
+                def _find_tokens(obj: object) -> None:
+                    nonlocal input_tokens, output_tokens
+                    if isinstance(obj, dict):
+                        for key, value in cast(dict[object, object], obj).items():
+                            if isinstance(value, int):
+                                if "input" in str(key) or "prompt" in str(key):
+                                    input_tokens = value
+                                elif "output" in str(key) or "completion" in str(key):
+                                    output_tokens = value
+                            else:
+                                _find_tokens(value)
+                    elif isinstance(obj, list):
+                        for item in cast(list[object], obj):
+                            _find_tokens(item)
+
+                _find_tokens(message.model_dump())
+
+            if input_tokens is None or output_tokens is None:
+                return None
+            return input_tokens, output_tokens
+        except Exception:
+            return None
+
 
 def with_structured_output(
     client: BaseChatModel,
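get_num_tokens_from_message first trusts AIMessage.usage_metadata, then falls back to recursively scanning the dumped message for integer fields whose keys mention input/prompt or output/completion, returning None when both counts cannot be found. A usage sketch; whether usage metadata is populated depends on the provider, so the None branch is a normal outcome:

    response = chatterer.client.invoke("Hello!")  # raw AIMessage from the underlying chat model
    usage = Chatterer.get_num_tokens_from_message(response)
    if usage is not None:
        input_tokens, output_tokens = usage
        print(f"prompt={input_tokens} completion={output_tokens}")
    else:
        print("No token usage reported by this provider.")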
@@ -255,36 +310,25 @@ if __name__ == "__main__":
 
     # === Synchronous Tests ===
 
-    # 1. generate
+    # generate
     print("=== Synchronous generate ===")
-    result_sync = chatterer.generate(prompt)
+    result_sync = chatterer(prompt)
     print("Result (generate):", result_sync)
 
-    # 2. __call__
-    print("\n=== Synchronous __call__ ===")
-    result_call = chatterer(prompt)
-    print("Result (__call__):", result_call)
-
-    # 3. generate_stream
+    # generate_stream
     print("\n=== Synchronous generate_stream ===")
     for i, chunk in enumerate(chatterer.generate_stream(prompt)):
         print(f"Chunk {i}:", chunk)
 
-    # 4. generate_pydantic
+    # generate_pydantic
     print("\n=== Synchronous generate_pydantic ===")
-    try:
-        result_pydantic = chatterer.generate_pydantic(Propositions, prompt)
-        print("Result (generate_pydantic):", result_pydantic)
-    except Exception as e:
-        print("Error in generate_pydantic:", e)
+    result_pydantic = chatterer(prompt, Propositions)
+    print("Result (generate_pydantic):", result_pydantic)
 
-    # 5. generate_pydantic_stream
+    # generate_pydantic_stream
     print("\n=== Synchronous generate_pydantic_stream ===")
-    try:
-        for i, chunk in enumerate(chatterer.generate_pydantic_stream(Propositions, prompt)):
-            print(f"Pydantic Chunk {i}:", chunk)
-    except Exception as e:
-        print("Error in generate_pydantic_stream:", e)
+    for i, chunk in enumerate(chatterer.generate_pydantic_stream(Propositions, prompt)):
+        print(f"Pydantic Chunk {i}:", chunk)
 
     # === Asynchronous Tests ===
 
{chatterer-0.1.6 → chatterer-0.1.8}/chatterer/strategies/atom_of_thoughts.py

@@ -6,11 +6,10 @@ from dataclasses import dataclass, field
 from enum import StrEnum
 from typing import Optional, Type, TypeVar
 
-from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
 from pydantic import BaseModel, Field, ValidationError
 
-# Import your Chatterer interface (do not remove)
 from ..language_model import Chatterer, LanguageModelInput
+from ..messages import AIMessage, BaseMessage, HumanMessage
 from .base import BaseStrategy
 
 # ---------------------------------------------------------------------------------
{chatterer-0.1.6 → chatterer-0.1.8}/chatterer/tools/__init__.py

@@ -1,3 +1,4 @@
+from .citation_chunking import citation_chunker
 from .convert_to_text import (
     anything_to_markdown,
     get_default_html_to_markdown_options,
@@ -12,4 +13,5 @@ __all__ = [
     "pdf_to_text",
     "get_default_html_to_markdown_options",
     "pyscripts_to_snippets",
+    "citation_chunker",
 ]
chatterer-0.1.8/chatterer/tools/citation_chunking/chunks.py

@@ -0,0 +1,53 @@
+import logging
+from typing import Callable, Optional, Self
+
+from pydantic import BaseModel, Field
+
+from ...language_model import Chatterer
+from ...messages import AIMessage, BaseMessage, HumanMessage
+from .prompt import (
+    generate_fewshot_affirmative_response,
+    generate_human_assistant_fewshot_examples,
+    generate_instruction,
+)
+from .reference import Reference
+
+logger = logging.getLogger(__name__)
+
+
+class CitationChunk(BaseModel):
+    subject: str = Field(description="The main topic or subject that the citations capture.")
+    references: list[Reference] = Field(description="A list of citation objects and/or regex patterns for the subject.")
+
+
+class CitationChunks(BaseModel):
+    citation_chunks: list[CitationChunk] = Field(
+        description="A list of citation chunks, each capturing a specific topic in the document."
+    )
+
+    @classmethod
+    def from_llm(
+        cls,
+        chatterer: Chatterer,
+        document: str,
+        fewshot_examples_generator: Optional[
+            Callable[[], list[tuple[str, str]]]
+        ] = generate_human_assistant_fewshot_examples,
+        instruction_generator: Optional[Callable[[], str]] = generate_instruction,
+        fewshot_affirmative_response: Optional[Callable[[], str]] = generate_fewshot_affirmative_response,
+    ) -> Self:
+        messages: list[BaseMessage] = []
+        if instruction_generator:
+            messages.append(HumanMessage(content=instruction_generator()))
+        if fewshot_examples_generator is not None:
+            if fewshot_affirmative_response:
+                messages.append(AIMessage(content=generate_fewshot_affirmative_response()))
+            for human_ask, ai_answer in fewshot_examples_generator():
+                messages.append(HumanMessage(content=human_ask))
+                messages.append(AIMessage(content=ai_answer))
+        messages.append(HumanMessage(content=document))
+        try:
+            return chatterer.generate_pydantic(response_model=cls, messages=messages)
+        except Exception as e:
+            logger.error(f"Error obtaining CitationChunks from LLM: {e}")
+            raise e
chatterer-0.1.8/chatterer/tools/citation_chunking/citation_chunker.py

@@ -0,0 +1,118 @@
+import logging
+from typing import Callable, NamedTuple, Optional, Self
+
+import colorama
+from colorama import Fore
+
+from ...language_model import Chatterer
+from .chunks import CitationChunks
+from .citations import Citations
+from .prompt import (
+    generate_fewshot_affirmative_response,
+    generate_human_assistant_fewshot_examples,
+    generate_instruction,
+)
+
+logger = logging.getLogger(__name__)
+colorama.init()
+
+
+class GlobalCoverage(NamedTuple):
+    coverage: float
+    matched_intervals: list[tuple[int, int]]
+
+    @staticmethod
+    def merge_intervals(intervals: list[tuple[int, int]]) -> list[tuple[int, int]]:
+        if not intervals:
+            return []
+        sorted_intervals = sorted(intervals, key=lambda x: x[0])
+        merged: list[tuple[int, int]] = [sorted_intervals[0]]
+        for current in sorted_intervals[1:]:
+            prev = merged[-1]
+            if current[0] <= prev[1]:
+                merged[-1] = (prev[0], max(prev[1], current[1]))
+            else:
+                merged.append(current)
+        return merged
+
+    @classmethod
+    def from_verified_citations(cls, verified_chunks: list[Citations], document: str) -> Self:
+        all_intervals: list[tuple[int, int]] = []
+        for chunk in verified_chunks:
+            for matches in chunk.references.values():
+                for m in matches:
+                    all_intervals.append((m.start_idx, m.end_idx))
+        merged: list[tuple[int, int]] = cls.merge_intervals(all_intervals)
+        doc_length: int = len(document)
+        total_matched = sum((e - s for s, e in merged))
+        coverage: float = total_matched / doc_length if doc_length > 0 else 0.0
+        return cls(coverage=coverage, matched_intervals=merged)
+
+
+def citation_chunker(
+    document: str,
+    chatterer: Chatterer,
+    global_coverage_threshold: float = 0.9,
+    num_refinement_steps: int = 3,
+    fewshot_examples_generator: Optional[
+        Callable[[], list[tuple[str, str]]]
+    ] = generate_human_assistant_fewshot_examples,
+    instruction_generator: Optional[Callable[[], str]] = generate_instruction,
+    fewshot_affirmative_response: Optional[Callable[[], str]] = generate_fewshot_affirmative_response,
+    test_global_coverage: bool = False,
+) -> list[Citations]:
+    """
+    1) Obtain CitationChunks via the LLM.
+    2) Process each chunk to extract MatchedText using snippet-based index correction.
+    3) Calculate overall document coverage and print results.
+    """
+    unverified_chunks: CitationChunks = CitationChunks.from_llm(
+        chatterer=chatterer,
+        document=document,
+        fewshot_examples_generator=fewshot_examples_generator,
+        instruction_generator=instruction_generator,
+        fewshot_affirmative_response=fewshot_affirmative_response,
+    )
+
+    verified_chunks: list[Citations] = []
+    for chunk in unverified_chunks.citation_chunks:
+        try:
+            vc: Citations = Citations.from_unverified(
+                unverified_chunk=chunk,
+                document=document,
+                model_and_refinement_steps=(chatterer, num_refinement_steps),
+            )
+            verified_chunks.append(vc)
+        except Exception as e:
+            logger.error(f"Error processing chunk for subject '{chunk.subject}': {e}")
+
+    if test_global_coverage:
+        gc = GlobalCoverage.from_verified_citations(verified_chunks, document)
+        logger.info(f"Global coverage: {gc.coverage * 100:.1f}%")
+        if gc.coverage < global_coverage_threshold:
+            logger.info(
+                f"Global coverage {gc.coverage * 100:.1f}% is below the threshold {global_coverage_threshold * 100:.1f}%."
+            )
+        print("=== Final Global Coverage Check ===")
+        print(f"Overall coverage: {gc.coverage * 100:.1f}% of the document.")
+        if gc.matched_intervals:
+            print("Merged matched intervals:")
+            for interval in gc.matched_intervals:
+                print(f"  - {interval}")
+        else:
+            print("No matches found across all chunks.")
+        print("\n=== Raw Semantic Chunking Result ===")
+        for vc in verified_chunks:
+            print(f"{Fore.LIGHTGREEN_EX}[SUBJECT] {Fore.GREEN}{vc.name}{Fore.RESET}")
+            if vc.references:
+                for source_key, matches in vc.references.items():
+                    print(f"{Fore.LIGHTBLUE_EX}  [SOURCE] {Fore.BLUE}{source_key}{Fore.RESET}")
+                    for mt in matches:
+                        snippet = repr(mt.text)
+                        print(
+                            f"    {Fore.LIGHTYELLOW_EX}[MATCH @ {mt.start_idx}~{mt.end_idx}] {Fore.YELLOW}{snippet}{Fore.RESET}"
+                        )
+            else:
+                print("  - (No matches found even after refinement.)")
+
+    return verified_chunks
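GlobalCoverage.merge_intervals is a standard sort-and-sweep interval merge, and coverage is the merged matched length divided by the document length. A small worked sketch of both, followed by a hedged call to the chunker itself; the document and chatterer variables are assumed to exist:

    intervals = [(0, 10), (5, 20), (30, 40)]
    print(GlobalCoverage.merge_intervals(intervals))  # [(0, 20), (30, 40)]
    # Coverage over a 50-character document would be (20 + 10) / 50 = 0.6.

    verified = citation_chunker(
        document=document,          # assumed plain-text input
        chatterer=chatterer,
        test_global_coverage=True,  # prints the coverage report implemented above
    )
    for citations in verified:
        print(citations.name, list(citations.references))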