chatterer 0.1.6__tar.gz → 0.1.8__tar.gz

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (29)
  1. {chatterer-0.1.6 → chatterer-0.1.8}/PKG-INFO +2 -2
  2. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/__init__.py +12 -0
  3. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/language_model.py +78 -34
  4. chatterer-0.1.8/chatterer/messages.py +8 -0
  5. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/strategies/atom_of_thoughts.py +1 -2
  6. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/tools/__init__.py +2 -0
  7. chatterer-0.1.8/chatterer/tools/citation_chunking/__init__.py +3 -0
  8. chatterer-0.1.8/chatterer/tools/citation_chunking/chunks.py +53 -0
  9. chatterer-0.1.8/chatterer/tools/citation_chunking/citation_chunker.py +118 -0
  10. chatterer-0.1.8/chatterer/tools/citation_chunking/citations.py +285 -0
  11. chatterer-0.1.8/chatterer/tools/citation_chunking/prompt.py +157 -0
  12. chatterer-0.1.8/chatterer/tools/citation_chunking/reference.py +26 -0
  13. chatterer-0.1.8/chatterer/tools/citation_chunking/utils.py +138 -0
  14. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/tools/convert_to_text.py +40 -41
  15. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/tools/webpage_to_markdown/playwright_bot.py +28 -10
  16. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/tools/webpage_to_markdown/utils.py +11 -238
  17. chatterer-0.1.8/chatterer/utils/image.py +288 -0
  18. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer.egg-info/PKG-INFO +2 -2
  19. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer.egg-info/SOURCES.txt +10 -1
  20. {chatterer-0.1.6 → chatterer-0.1.8}/pyproject.toml +30 -30
  21. {chatterer-0.1.6 → chatterer-0.1.8}/README.md +0 -0
  22. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/py.typed +0 -0
  23. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/strategies/__init__.py +0 -0
  24. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/strategies/base.py +0 -0
  25. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer/tools/webpage_to_markdown/__init__.py +0 -0
  26. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer.egg-info/dependency_links.txt +0 -0
  27. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer.egg-info/requires.txt +0 -0
  28. {chatterer-0.1.6 → chatterer-0.1.8}/chatterer.egg-info/top_level.txt +0 -0
  29. {chatterer-0.1.6 → chatterer-0.1.8}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: chatterer
-Version: 0.1.6
+Version: 0.1.8
 Summary: The highest-level interface for various LLM APIs.
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
@@ -1,4 +1,10 @@
 from .language_model import Chatterer
+from .messages import (
+    AIMessage,
+    BaseMessage,
+    HumanMessage,
+    SystemMessage,
+)
 from .strategies import (
     AoTPipeline,
     AoTPrompter,
@@ -7,6 +13,7 @@ from .strategies import (
 )
 from .tools import (
     anything_to_markdown,
+    citation_chunker,
     get_default_html_to_markdown_options,
     html_to_markdown,
     pdf_to_text,
@@ -24,4 +31,9 @@ __all__ = [
     "pdf_to_text",
     "get_default_html_to_markdown_options",
     "pyscripts_to_snippets",
+    "citation_chunker",
+    "BaseMessage",
+    "HumanMessage",
+    "SystemMessage",
+    "AIMessage",
 ]
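
With these re-exports in place, user code can pull the LangChain message types and the new chunker straight from the package root instead of importing from langchain_core. A minimal sketch (the content strings are placeholders):

    from chatterer import HumanMessage, SystemMessage, citation_chunker  # new in 0.1.8

    # build a conversation without touching langchain_core directly
    conversation = [
        SystemMessage(content="You are a concise assistant."),
        HumanMessage(content="Summarize this diff in one sentence."),
    ]
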
@@ -8,15 +8,18 @@ from typing import (
     Type,
     TypeAlias,
     TypeVar,
+    cast,
+    overload,
 )
 
 from langchain_core.language_models.base import LanguageModelInput
 from langchain_core.language_models.chat_models import BaseChatModel
-from langchain_core.messages import HumanMessage
 from langchain_core.runnables.base import Runnable
 from langchain_core.runnables.config import RunnableConfig
 from pydantic import BaseModel, Field
 
+from .messages import AIMessage, BaseMessage, HumanMessage
+
 if TYPE_CHECKING:
     from instructor import Partial
 
@@ -32,19 +35,37 @@ class Chatterer(BaseModel):
     client: BaseChatModel
     structured_output_kwargs: dict[str, Any] = Field(default_factory=dict)
 
-    def __call__(self, messages: LanguageModelInput) -> str:
-        """
-        Generate text from the given input messages.
-
-        Args:
-            messages (LanguageModelInput): Input messages for the language model.
-                Can be one of the following types:
-                - str: A single string message.
-                - list[dict[str, str]]: A list of dictionaries with 'role' and 'content' keys.
-                - tuple[str, str]: A tuple of strings representing the role and content of a single message.
-                - list[BaseMessage]: A list of BaseMessage objects. (BaseMessage is a Pydantic model; e.g. can import AIMessage, HumanMessage, SystemMessage from langchain_core.messages)
-        """
-        return self.generate(messages)
+    @overload
+    def __call__(
+        self,
+        messages: LanguageModelInput,
+        response_model: Type[PydanticModelT],
+        config: Optional[RunnableConfig] = None,
+        stop: Optional[list[str]] = None,
+        **kwargs: Any,
+    ) -> PydanticModelT: ...
+
+    @overload
+    def __call__(
+        self,
+        messages: LanguageModelInput,
+        response_model: None = None,
+        config: Optional[RunnableConfig] = None,
+        stop: Optional[list[str]] = None,
+        **kwargs: Any,
+    ) -> str: ...
+
+    def __call__(
+        self,
+        messages: LanguageModelInput,
+        response_model: Optional[Type[PydanticModelT]] = None,
+        config: Optional[RunnableConfig] = None,
+        stop: Optional[list[str]] = None,
+        **kwargs: Any,
+    ) -> str | PydanticModelT:
+        if response_model:
+            return self.generate_pydantic(response_model, messages, config, stop, **kwargs)
+        return self.client.invoke(input=messages, config=config, stop=stop, **kwargs).text()
 
     @classmethod
     def openai(
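
The overloads tie the return type to the presence of response_model: a bare call yields str, while passing a Pydantic model class routes through generate_pydantic and yields an instance of that class. A usage sketch, mirroring the demo code later in this file (the Propositions model is illustrative, and Chatterer.openai() is assumed to have usable defaults and credentials):

    from pydantic import BaseModel

    from chatterer import Chatterer


    class Propositions(BaseModel):
        # hypothetical response model for illustration
        statements: list[str]


    chatterer = Chatterer.openai()  # assumption: default construction works in your environment

    plain: str = chatterer("Give me a one-line fun fact.")                      # no model -> str
    parsed: Propositions = chatterer("List three facts about otters.", Propositions)  # model -> Propositions
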
@@ -233,6 +254,40 @@ class Chatterer(BaseModel):
             )
         ])
 
+    @staticmethod
+    def get_num_tokens_from_message(message: BaseMessage) -> Optional[tuple[int, int]]:
+        try:
+            if isinstance(message, AIMessage) and (usage_metadata := message.usage_metadata):
+                input_tokens = int(usage_metadata["input_tokens"])
+                output_tokens = int(usage_metadata["output_tokens"])
+            else:
+                # Dynamic extraction for unknown structures
+                input_tokens: Optional[int] = None
+                output_tokens: Optional[int] = None
+
+                def _find_tokens(obj: object) -> None:
+                    nonlocal input_tokens, output_tokens
+                    if isinstance(obj, dict):
+                        for key, value in cast(dict[object, object], obj).items():
+                            if isinstance(value, int):
+                                if "input" in str(key) or "prompt" in str(key):
+                                    input_tokens = value
+                                elif "output" in str(key) or "completion" in str(key):
+                                    output_tokens = value
+                            else:
+                                _find_tokens(value)
+                    elif isinstance(obj, list):
+                        for item in cast(list[object], obj):
+                            _find_tokens(item)
+
+                _find_tokens(message.model_dump())
+
+            if input_tokens is None or output_tokens is None:
+                return None
+            return input_tokens, output_tokens
+        except Exception:
+            return None
+
 
 def with_structured_output(
     client: BaseChatModel,
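
The counter first trusts AIMessage.usage_metadata and only then falls back to a recursive scan of the dumped message for any int whose key mentions input/prompt or output/completion, returning None rather than raising when nothing matches. A sketch of the intended call pattern (provider and prompt are placeholders):

    reply = chatterer.client.invoke("Hello!")  # BaseChatModel.invoke; an AIMessage for chat models
    counts = Chatterer.get_num_tokens_from_message(reply)
    if counts is None:
        print("no token usage reported for this provider/message")
    else:
        input_tokens, output_tokens = counts
        print(f"{input_tokens} prompt tokens, {output_tokens} completion tokens")
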
@@ -255,36 +310,25 @@ if __name__ == "__main__":
 
     # === Synchronous Tests ===
 
-    # 1. generate
+    # generate
     print("=== Synchronous generate ===")
-    result_sync = chatterer.generate(prompt)
+    result_sync = chatterer(prompt)
     print("Result (generate):", result_sync)
 
-    # 2. __call__
-    print("\n=== Synchronous __call__ ===")
-    result_call = chatterer(prompt)
-    print("Result (__call__):", result_call)
-
-    # 3. generate_stream
+    # generate_stream
     print("\n=== Synchronous generate_stream ===")
     for i, chunk in enumerate(chatterer.generate_stream(prompt)):
         print(f"Chunk {i}:", chunk)
 
-    # 4. generate_pydantic
+    # generate_pydantic
     print("\n=== Synchronous generate_pydantic ===")
-    try:
-        result_pydantic = chatterer.generate_pydantic(Propositions, prompt)
-        print("Result (generate_pydantic):", result_pydantic)
-    except Exception as e:
-        print("Error in generate_pydantic:", e)
+    result_pydantic = chatterer(prompt, Propositions)
+    print("Result (generate_pydantic):", result_pydantic)
 
-    # 5. generate_pydantic_stream
+    # generate_pydantic_stream
     print("\n=== Synchronous generate_pydantic_stream ===")
-    try:
-        for i, chunk in enumerate(chatterer.generate_pydantic_stream(Propositions, prompt)):
-            print(f"Pydantic Chunk {i}:", chunk)
-    except Exception as e:
-        print("Error in generate_pydantic_stream:", e)
+    for i, chunk in enumerate(chatterer.generate_pydantic_stream(Propositions, prompt)):
+        print(f"Pydantic Chunk {i}:", chunk)
 
     # === Asynchronous Tests ===
 
@@ -0,0 +1,8 @@
+from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage
+
+__all__ = [
+    "AIMessage",
+    "BaseMessage",
+    "HumanMessage",
+    "SystemMessage",
+]
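
The new module is a thin indirection layer: it re-exports LangChain's message classes so that package internals (see the atom_of_thoughts and citation_chunking diffs below) and user code share one import path. For example:

    from chatterer.messages import AIMessage, HumanMessage, SystemMessage

    history = [
        SystemMessage(content="You are terse."),
        HumanMessage(content="ping"),
        AIMessage(content="pong"),
    ]
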
@@ -6,11 +6,10 @@ from dataclasses import dataclass, field
 from enum import StrEnum
 from typing import Optional, Type, TypeVar
 
-from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
 from pydantic import BaseModel, Field, ValidationError
 
-# Import your Chatterer interface (do not remove)
 from ..language_model import Chatterer, LanguageModelInput
+from ..messages import AIMessage, BaseMessage, HumanMessage
 from .base import BaseStrategy
 
 # ---------------------------------------------------------------------------------
@@ -1,3 +1,4 @@
+from .citation_chunking import citation_chunker
 from .convert_to_text import (
     anything_to_markdown,
     get_default_html_to_markdown_options,
@@ -12,4 +13,5 @@ __all__ = [
     "pdf_to_text",
     "get_default_html_to_markdown_options",
     "pyscripts_to_snippets",
+    "citation_chunker",
 ]
@@ -0,0 +1,3 @@
+from .citation_chunker import citation_chunker
+
+__all__ = ["citation_chunker"]
@@ -0,0 +1,53 @@
+import logging
+from typing import Callable, Optional, Self
+
+from pydantic import BaseModel, Field
+
+from ...language_model import Chatterer
+from ...messages import AIMessage, BaseMessage, HumanMessage
+from .prompt import (
+    generate_fewshot_affirmative_response,
+    generate_human_assistant_fewshot_examples,
+    generate_instruction,
+)
+from .reference import Reference
+
+logger = logging.getLogger(__name__)
+
+
+class CitationChunk(BaseModel):
+    subject: str = Field(description="The main topic or subject that the citations capture.")
+    references: list[Reference] = Field(description="A list of citation objects and/or regex patterns for the subject.")
+
+
+class CitationChunks(BaseModel):
+    citation_chunks: list[CitationChunk] = Field(
+        description="A list of citation chunks, each capturing a specific topic in the document."
+    )
+
+    @classmethod
+    def from_llm(
+        cls,
+        chatterer: Chatterer,
+        document: str,
+        fewshot_examples_generator: Optional[
+            Callable[[], list[tuple[str, str]]]
+        ] = generate_human_assistant_fewshot_examples,
+        instruction_generator: Optional[Callable[[], str]] = generate_instruction,
+        fewshot_affirmative_response: Optional[Callable[[], str]] = generate_fewshot_affirmative_response,
+    ) -> Self:
+        messages: list[BaseMessage] = []
+        if instruction_generator:
+            messages.append(HumanMessage(content=instruction_generator()))
+        if fewshot_examples_generator is not None:
+            if fewshot_affirmative_response:
+                messages.append(AIMessage(content=generate_fewshot_affirmative_response()))
+            for human_ask, ai_answer in fewshot_examples_generator():
+                messages.append(HumanMessage(content=human_ask))
+                messages.append(AIMessage(content=ai_answer))
+        messages.append(HumanMessage(content=document))
+        try:
+            return chatterer.generate_pydantic(response_model=cls, messages=messages)
+        except Exception as e:
+            logger.error(f"Error obtaining CitationChunks from LLM: {e}")
+            raise e
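
from_llm assembles a chat transcript (optional instruction, optional affirmative turn plus few-shot pairs, then the raw document) and asks generate_pydantic to parse the reply into CitationChunks. Note one quirk visible in the hunk: the branch checks the fewshot_affirmative_response parameter but calls the module-level generate_fewshot_affirmative_response(), so a custom callable passed for that parameter is never invoked. A direct-use sketch (chatterer and document are assumed to exist; errors are logged and re-raised):

    chunks = CitationChunks.from_llm(chatterer=chatterer, document=document)
    for chunk in chunks.citation_chunks:
        print(chunk.subject, "->", len(chunk.references), "reference(s)")
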
@@ -0,0 +1,118 @@
+import logging
+from typing import Callable, NamedTuple, Optional, Self
+
+import colorama
+from colorama import Fore
+
+from ...language_model import Chatterer
+from .chunks import CitationChunks
+from .citations import Citations
+from .prompt import (
+    generate_fewshot_affirmative_response,
+    generate_human_assistant_fewshot_examples,
+    generate_instruction,
+)
+
+logger = logging.getLogger(__name__)
+colorama.init()
+
+
+class GlobalCoverage(NamedTuple):
+    coverage: float
+    matched_intervals: list[tuple[int, int]]
+
+    @staticmethod
+    def merge_intervals(intervals: list[tuple[int, int]]) -> list[tuple[int, int]]:
+        if not intervals:
+            return []
+        sorted_intervals = sorted(intervals, key=lambda x: x[0])
+        merged: list[tuple[int, int]] = [sorted_intervals[0]]
+        for current in sorted_intervals[1:]:
+            prev = merged[-1]
+            if current[0] <= prev[1]:
+                merged[-1] = (prev[0], max(prev[1], current[1]))
+            else:
+                merged.append(current)
+        return merged
+
+    @classmethod
+    def from_verified_citations(cls, verified_chunks: list[Citations], document: str) -> Self:
+        all_intervals: list[tuple[int, int]] = []
+        for chunk in verified_chunks:
+            for matches in chunk.references.values():
+                for m in matches:
+                    all_intervals.append((m.start_idx, m.end_idx))
+        merged: list[tuple[int, int]] = cls.merge_intervals(all_intervals)
+        doc_length: int = len(document)
+        total_matched = sum((e - s for s, e in merged))
+        coverage: float = total_matched / doc_length if doc_length > 0 else 0.0
+        return cls(coverage=coverage, matched_intervals=merged)
+
+
+def citation_chunker(
+    document: str,
+    chatterer: Chatterer,
+    global_coverage_threshold: float = 0.9,
+    num_refinement_steps: int = 3,
+    fewshot_examples_generator: Optional[
+        Callable[[], list[tuple[str, str]]]
+    ] = generate_human_assistant_fewshot_examples,
+    instruction_generator: Optional[Callable[[], str]] = generate_instruction,
+    fewshot_affirmative_response: Optional[Callable[[], str]] = generate_fewshot_affirmative_response,
+    test_global_coverage: bool = False,
+) -> list[Citations]:
+    """
+    1) Obtain CitationChunks via the LLM.
+    2) Process each chunk to extract MatchedText using snippet-based index correction.
+    3) Calculate overall document coverage and print results.
+    """
+    unverified_chunks: CitationChunks = CitationChunks.from_llm(
+        chatterer=chatterer,
+        document=document,
+        fewshot_examples_generator=fewshot_examples_generator,
+        instruction_generator=instruction_generator,
+        fewshot_affirmative_response=fewshot_affirmative_response,
+    )
+
+    verified_chunks: list[Citations] = []
+    for chunk in unverified_chunks.citation_chunks:
+        try:
+            vc: Citations = Citations.from_unverified(
+                unverified_chunk=chunk,
+                document=document,
+                model_and_refinement_steps=(chatterer, num_refinement_steps),
+            )
+            verified_chunks.append(vc)
+        except Exception as e:
+            logger.error(f"Error processing chunk for subject '{chunk.subject}': {e}")
+
+    if test_global_coverage:
+        gc = GlobalCoverage.from_verified_citations(verified_chunks, document)
+        logger.info(f"Global coverage: {gc.coverage * 100:.1f}%")
+        if gc.coverage < global_coverage_threshold:
+            logger.info(
+                f"Global coverage {gc.coverage * 100:.1f}% is below the threshold {global_coverage_threshold * 100:.1f}%."
+            )
+        print("=== Final Global Coverage Check ===")
+        print(f"Overall coverage: {gc.coverage * 100:.1f}% of the document.")
+        if gc.matched_intervals:
+            print("Merged matched intervals:")
+            for interval in gc.matched_intervals:
+                print(f" - {interval}")
+        else:
+            print("No matches found across all chunks.")
+        print("\n=== Raw Semantic Chunking Result ===")
+        for vc in verified_chunks:
+            print(f"{Fore.LIGHTGREEN_EX}[SUBJECT] {Fore.GREEN}{vc.name}{Fore.RESET}")
+            if vc.references:
+                for source_key, matches in vc.references.items():
+                    print(f"{Fore.LIGHTBLUE_EX} [SOURCE] {Fore.BLUE}{source_key}{Fore.RESET}")
+                    for mt in matches:
+                        snippet = repr(mt.text)
+                        print(
+                            f" {Fore.LIGHTYELLOW_EX}[MATCH @ {mt.start_idx}~{mt.end_idx}] {Fore.YELLOW}{snippet}{Fore.RESET}"
+                        )
+            else:
+                print(" - (No matches found even after refinement.)")
+
+    return verified_chunks
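
End to end, the new public entry point wires the pieces above together: LLM-proposed chunks, snippet-verified citations, and an optional coverage report built from merged (start, end) intervals. merge_intervals counts overlaps once, e.g. [(0, 10), (5, 20), (30, 40)] merges to [(0, 20), (30, 40)] before the coverage ratio is computed. A minimal driver sketch (model choice and file name are placeholders, and the Citations attributes used are only those shown in this hunk; untested against the release):

    from chatterer import Chatterer, citation_chunker

    chatterer = Chatterer.openai()  # assumption: default construction works in your environment
    with open("paper.txt", encoding="utf-8") as f:  # placeholder document
        document = f.read()

    # returns the verified chunks; test_global_coverage=True also prints the coverage report
    for citations in citation_chunker(document=document, chatterer=chatterer, test_global_coverage=True):
        print(citations.name, list(citations.references))
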