chatterer 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. chatterer/__init__.py +93 -93
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/examples/__init__.py +0 -0
  5. chatterer/examples/anything_to_markdown.py +95 -0
  6. chatterer/examples/get_code_snippets.py +64 -0
  7. chatterer/examples/login_with_playwright.py +171 -0
  8. chatterer/examples/make_ppt.py +499 -0
  9. chatterer/examples/pdf_to_markdown.py +107 -0
  10. chatterer/examples/pdf_to_text.py +60 -0
  11. chatterer/examples/transcription_api.py +127 -0
  12. chatterer/examples/upstage_parser.py +95 -0
  13. chatterer/examples/webpage_to_markdown.py +79 -0
  14. chatterer/interactive.py +354 -354
  15. chatterer/language_model.py +533 -533
  16. chatterer/messages.py +21 -21
  17. chatterer/strategies/__init__.py +13 -13
  18. chatterer/strategies/atom_of_thoughts.py +975 -975
  19. chatterer/strategies/base.py +14 -14
  20. chatterer/tools/__init__.py +46 -46
  21. chatterer/tools/caption_markdown_images.py +384 -384
  22. chatterer/tools/citation_chunking/__init__.py +3 -3
  23. chatterer/tools/citation_chunking/chunks.py +53 -53
  24. chatterer/tools/citation_chunking/citation_chunker.py +118 -118
  25. chatterer/tools/citation_chunking/citations.py +285 -285
  26. chatterer/tools/citation_chunking/prompt.py +157 -157
  27. chatterer/tools/citation_chunking/reference.py +26 -26
  28. chatterer/tools/citation_chunking/utils.py +138 -138
  29. chatterer/tools/convert_pdf_to_markdown.py +302 -302
  30. chatterer/tools/convert_to_text.py +447 -447
  31. chatterer/tools/upstage_document_parser.py +705 -705
  32. chatterer/tools/webpage_to_markdown.py +739 -739
  33. chatterer/tools/youtube.py +146 -146
  34. chatterer/utils/__init__.py +15 -15
  35. chatterer/utils/base64_image.py +285 -285
  36. chatterer/utils/bytesio.py +59 -59
  37. chatterer/utils/code_agent.py +237 -237
  38. chatterer/utils/imghdr.py +148 -148
  39. {chatterer-0.1.17.dist-info → chatterer-0.1.19.dist-info}/METADATA +392 -392
  40. chatterer-0.1.19.dist-info/RECORD +44 -0
  41. {chatterer-0.1.17.dist-info → chatterer-0.1.19.dist-info}/WHEEL +1 -1
  42. chatterer-0.1.19.dist-info/entry_points.txt +10 -0
  43. chatterer-0.1.17.dist-info/RECORD +0 -33
  44. {chatterer-0.1.17.dist-info → chatterer-0.1.19.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,3 @@
1
- from .citation_chunker import citation_chunker
2
-
3
- __all__ = ["citation_chunker"]
1
+ from .citation_chunker import citation_chunker
2
+
3
+ __all__ = ["citation_chunker"]
@@ -1,53 +1,53 @@
1
- import logging
2
- from typing import Callable, Optional, Self
3
-
4
- from pydantic import BaseModel, Field
5
-
6
- from ...language_model import Chatterer
7
- from ...messages import AIMessage, BaseMessage, HumanMessage
8
- from .prompt import (
9
- generate_fewshot_affirmative_response,
10
- generate_human_assistant_fewshot_examples,
11
- generate_instruction,
12
- )
13
- from .reference import Reference
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
-
18
class CitationChunk(BaseModel):
    # One subject together with the references that support it.
    # Documented with comments rather than a class docstring: pydantic copies
    # a model's docstring into its generated JSON schema, so adding one would
    # change the schema produced for this model.

    # Topic shared by every entry in `references`.
    subject: str = Field(
        description="The main topic or subject that the citations capture.",
    )
    # Reference entries collected for `subject`.
    references: list[Reference] = Field(
        description="A list of citation objects and/or regex patterns for the subject.",
    )
21
-
22
-
23
class CitationChunks(BaseModel):
    # Container for every CitationChunk extracted from one document.
    # Documented with comments rather than a class docstring: pydantic copies
    # a model's docstring into its JSON schema, and this class is handed to
    # `generate_pydantic` as the response model in `from_llm`.
    citation_chunks: list[CitationChunk] = Field(
        description="A list of citation chunks, each capturing a specific topic in the document."
    )

    @classmethod
    def from_llm(
        cls,
        chatterer: Chatterer,
        document: str,
        fewshot_examples_generator: Optional[
            Callable[[], list[tuple[str, str]]]
        ] = generate_human_assistant_fewshot_examples,
        instruction_generator: Optional[Callable[[], str]] = generate_instruction,
        fewshot_affirmative_response: Optional[Callable[[], str]] = generate_fewshot_affirmative_response,
    ) -> Self:
        """Build the prompt for ``document`` and ask the model for citation chunks.

        The conversation is assembled in this order: the instruction (human),
        the affirmative reply (AI), the few-shot question/answer pairs, and
        finally the document itself (human).

        Args:
            chatterer: Model wrapper whose ``generate_pydantic`` is called.
            document: Text sent as the last human message.
            fewshot_examples_generator: Returns ``(human, ai)`` example pairs.
                Pass ``None`` to skip both the examples and the affirmative
                reply.
            instruction_generator: Returns the instruction text. A falsy value
                skips the instruction message.
            fewshot_affirmative_response: Returns the AI reply placed right
                before the examples. A falsy value skips it.

        Returns:
            Whatever ``chatterer.generate_pydantic`` returns for this class.

        Raises:
            Exception: Any error raised by ``generate_pydantic`` is logged and
                re-raised unchanged.
        """
        messages: list[BaseMessage] = []
        if instruction_generator:
            messages.append(HumanMessage(content=instruction_generator()))
        if fewshot_examples_generator is not None:
            if fewshot_affirmative_response:
                # Use the callable that was passed in. Calling the module-level
                # default here would silently ignore a caller's override.
                messages.append(AIMessage(content=fewshot_affirmative_response()))
            for human_ask, ai_answer in fewshot_examples_generator():
                messages.append(HumanMessage(content=human_ask))
                messages.append(AIMessage(content=ai_answer))
        messages.append(HumanMessage(content=document))
        try:
            return chatterer.generate_pydantic(response_model=cls, messages=messages)
        except Exception as e:
            logger.error(f"Error obtaining CitationChunks from LLM: {e}")
            # Bare raise keeps the original traceback intact.
            raise
1
+ import logging
2
+ from typing import Callable, Optional, Self
3
+
4
+ from pydantic import BaseModel, Field
5
+
6
+ from ...language_model import Chatterer
7
+ from ...messages import AIMessage, BaseMessage, HumanMessage
8
+ from .prompt import (
9
+ generate_fewshot_affirmative_response,
10
+ generate_human_assistant_fewshot_examples,
11
+ generate_instruction,
12
+ )
13
+ from .reference import Reference
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class CitationChunk(BaseModel):
    # One subject together with the references that support it.
    # Documented with comments rather than a class docstring: pydantic copies
    # a model's docstring into its generated JSON schema, so adding one would
    # change the schema produced for this model.

    # Topic shared by every entry in `references`.
    subject: str = Field(
        description="The main topic or subject that the citations capture.",
    )
    # Reference entries collected for `subject`.
    references: list[Reference] = Field(
        description="A list of citation objects and/or regex patterns for the subject.",
    )
21
+
22
+
23
class CitationChunks(BaseModel):
    # Container for every CitationChunk extracted from one document.
    # Documented with comments rather than a class docstring: pydantic copies
    # a model's docstring into its JSON schema, and this class is handed to
    # `generate_pydantic` as the response model in `from_llm`.
    citation_chunks: list[CitationChunk] = Field(
        description="A list of citation chunks, each capturing a specific topic in the document."
    )

    @classmethod
    def from_llm(
        cls,
        chatterer: Chatterer,
        document: str,
        fewshot_examples_generator: Optional[
            Callable[[], list[tuple[str, str]]]
        ] = generate_human_assistant_fewshot_examples,
        instruction_generator: Optional[Callable[[], str]] = generate_instruction,
        fewshot_affirmative_response: Optional[Callable[[], str]] = generate_fewshot_affirmative_response,
    ) -> Self:
        """Build the prompt for ``document`` and ask the model for citation chunks.

        The conversation is assembled in this order: the instruction (human),
        the affirmative reply (AI), the few-shot question/answer pairs, and
        finally the document itself (human).

        Args:
            chatterer: Model wrapper whose ``generate_pydantic`` is called.
            document: Text sent as the last human message.
            fewshot_examples_generator: Returns ``(human, ai)`` example pairs.
                Pass ``None`` to skip both the examples and the affirmative
                reply.
            instruction_generator: Returns the instruction text. A falsy value
                skips the instruction message.
            fewshot_affirmative_response: Returns the AI reply placed right
                before the examples. A falsy value skips it.

        Returns:
            Whatever ``chatterer.generate_pydantic`` returns for this class.

        Raises:
            Exception: Any error raised by ``generate_pydantic`` is logged and
                re-raised unchanged.
        """
        messages: list[BaseMessage] = []
        if instruction_generator:
            messages.append(HumanMessage(content=instruction_generator()))
        if fewshot_examples_generator is not None:
            if fewshot_affirmative_response:
                # Use the callable that was passed in. Calling the module-level
                # default here would silently ignore a caller's override.
                messages.append(AIMessage(content=fewshot_affirmative_response()))
            for human_ask, ai_answer in fewshot_examples_generator():
                messages.append(HumanMessage(content=human_ask))
                messages.append(AIMessage(content=ai_answer))
        messages.append(HumanMessage(content=document))
        try:
            return chatterer.generate_pydantic(response_model=cls, messages=messages)
        except Exception as e:
            logger.error(f"Error obtaining CitationChunks from LLM: {e}")
            # Bare raise keeps the original traceback intact.
            raise
@@ -1,118 +1,118 @@
1
- import logging
2
- from typing import Callable, NamedTuple, Optional, Self
3
-
4
- import colorama
5
- from colorama import Fore
6
-
7
- from ...language_model import Chatterer
8
- from .chunks import CitationChunks
9
- from .citations import Citations
10
- from .prompt import (
11
- generate_fewshot_affirmative_response,
12
- generate_human_assistant_fewshot_examples,
13
- generate_instruction,
14
- )
15
-
16
- logger = logging.getLogger(__name__)
17
- colorama.init()
18
-
19
-
20
- class GlobalCoverage(NamedTuple):
21
- coverage: float
22
- matched_intervals: list[tuple[int, int]]
23
-
24
- @staticmethod
25
- def merge_intervals(intervals: list[tuple[int, int]]) -> list[tuple[int, int]]:
26
- if not intervals:
27
- return []
28
- sorted_intervals = sorted(intervals, key=lambda x: x[0])
29
- merged: list[tuple[int, int]] = [sorted_intervals[0]]
30
- for current in sorted_intervals[1:]:
31
- prev = merged[-1]
32
- if current[0] <= prev[1]:
33
- merged[-1] = (prev[0], max(prev[1], current[1]))
34
- else:
35
- merged.append(current)
36
- return merged
37
-
38
- @classmethod
39
- def from_verified_citations(cls, verified_chunks: list[Citations], document: str) -> Self:
40
- all_intervals: list[tuple[int, int]] = []
41
- for chunk in verified_chunks:
42
- for matches in chunk.references.values():
43
- for m in matches:
44
- all_intervals.append((m.start_idx, m.end_idx))
45
- merged: list[tuple[int, int]] = cls.merge_intervals(all_intervals)
46
- doc_length: int = len(document)
47
- total_matched = sum((e - s for s, e in merged))
48
- coverage: float = total_matched / doc_length if doc_length > 0 else 0.0
49
- return cls(coverage=coverage, matched_intervals=merged)
50
-
51
-
52
- def citation_chunker(
53
- document: str,
54
- chatterer: Chatterer,
55
- global_coverage_threshold: float = 0.9,
56
- num_refinement_steps: int = 3,
57
- fewshot_examples_generator: Optional[
58
- Callable[[], list[tuple[str, str]]]
59
- ] = generate_human_assistant_fewshot_examples,
60
- instruction_generator: Optional[Callable[[], str]] = generate_instruction,
61
- fewshot_affirmative_response: Optional[Callable[[], str]] = generate_fewshot_affirmative_response,
62
- test_global_coverage: bool = False,
63
- ) -> list[Citations]:
64
- """
65
- 1) Obtain CitationChunks via the LLM.
66
- 2) Process each chunk to extract MatchedText using snippet-based index correction.
67
- 3) Calculate overall document coverage and print results.
68
- """
69
- unverified_chunks: CitationChunks = CitationChunks.from_llm(
70
- chatterer=chatterer,
71
- document=document,
72
- fewshot_examples_generator=fewshot_examples_generator,
73
- instruction_generator=instruction_generator,
74
- fewshot_affirmative_response=fewshot_affirmative_response,
75
- )
76
-
77
- verified_chunks: list[Citations] = []
78
- for chunk in unverified_chunks.citation_chunks:
79
- try:
80
- vc: Citations = Citations.from_unverified(
81
- unverified_chunk=chunk,
82
- document=document,
83
- model_and_refinement_steps=(chatterer, num_refinement_steps),
84
- )
85
- verified_chunks.append(vc)
86
- except Exception as e:
87
- logger.error(f"Error processing chunk for subject '{chunk.subject}': {e}")
88
-
89
- if test_global_coverage:
90
- gc = GlobalCoverage.from_verified_citations(verified_chunks, document)
91
- logger.info(f"Global coverage: {gc.coverage * 100:.1f}%")
92
- if gc.coverage < global_coverage_threshold:
93
- logger.info(
94
- f"Global coverage {gc.coverage * 100:.1f}% is below the threshold {global_coverage_threshold * 100:.1f}%."
95
- )
96
- print("=== Final Global Coverage Check ===")
97
- print(f"Overall coverage: {gc.coverage * 100:.1f}% of the document.")
98
- if gc.matched_intervals:
99
- print("Merged matched intervals:")
100
- for interval in gc.matched_intervals:
101
- print(f" - {interval}")
102
- else:
103
- print("No matches found across all chunks.")
104
- print("\n=== Raw Semantic Chunking Result ===")
105
- for vc in verified_chunks:
106
- print(f"{Fore.LIGHTGREEN_EX}[SUBJECT] {Fore.GREEN}{vc.name}{Fore.RESET}")
107
- if vc.references:
108
- for source_key, matches in vc.references.items():
109
- print(f"{Fore.LIGHTBLUE_EX} [SOURCE] {Fore.BLUE}{source_key}{Fore.RESET}")
110
- for mt in matches:
111
- snippet = repr(mt.text)
112
- print(
113
- f" {Fore.LIGHTYELLOW_EX}[MATCH @ {mt.start_idx}~{mt.end_idx}] {Fore.YELLOW}{snippet}{Fore.RESET}"
114
- )
115
- else:
116
- print(" - (No matches found even after refinement.)")
117
-
118
- return verified_chunks
1
+ import logging
2
+ from typing import Callable, NamedTuple, Optional, Self
3
+
4
+ import colorama
5
+ from colorama import Fore
6
+
7
+ from ...language_model import Chatterer
8
+ from .chunks import CitationChunks
9
+ from .citations import Citations
10
+ from .prompt import (
11
+ generate_fewshot_affirmative_response,
12
+ generate_human_assistant_fewshot_examples,
13
+ generate_instruction,
14
+ )
15
+
16
+ logger = logging.getLogger(__name__)
17
+ colorama.init()
18
+
19
+
20
+ class GlobalCoverage(NamedTuple):
21
+ coverage: float
22
+ matched_intervals: list[tuple[int, int]]
23
+
24
+ @staticmethod
25
+ def merge_intervals(intervals: list[tuple[int, int]]) -> list[tuple[int, int]]:
26
+ if not intervals:
27
+ return []
28
+ sorted_intervals = sorted(intervals, key=lambda x: x[0])
29
+ merged: list[tuple[int, int]] = [sorted_intervals[0]]
30
+ for current in sorted_intervals[1:]:
31
+ prev = merged[-1]
32
+ if current[0] <= prev[1]:
33
+ merged[-1] = (prev[0], max(prev[1], current[1]))
34
+ else:
35
+ merged.append(current)
36
+ return merged
37
+
38
+ @classmethod
39
+ def from_verified_citations(cls, verified_chunks: list[Citations], document: str) -> Self:
40
+ all_intervals: list[tuple[int, int]] = []
41
+ for chunk in verified_chunks:
42
+ for matches in chunk.references.values():
43
+ for m in matches:
44
+ all_intervals.append((m.start_idx, m.end_idx))
45
+ merged: list[tuple[int, int]] = cls.merge_intervals(all_intervals)
46
+ doc_length: int = len(document)
47
+ total_matched = sum((e - s for s, e in merged))
48
+ coverage: float = total_matched / doc_length if doc_length > 0 else 0.0
49
+ return cls(coverage=coverage, matched_intervals=merged)
50
+
51
+
52
+ def citation_chunker(
53
+ document: str,
54
+ chatterer: Chatterer,
55
+ global_coverage_threshold: float = 0.9,
56
+ num_refinement_steps: int = 3,
57
+ fewshot_examples_generator: Optional[
58
+ Callable[[], list[tuple[str, str]]]
59
+ ] = generate_human_assistant_fewshot_examples,
60
+ instruction_generator: Optional[Callable[[], str]] = generate_instruction,
61
+ fewshot_affirmative_response: Optional[Callable[[], str]] = generate_fewshot_affirmative_response,
62
+ test_global_coverage: bool = False,
63
+ ) -> list[Citations]:
64
+ """
65
+ 1) Obtain CitationChunks via the LLM.
66
+ 2) Process each chunk to extract MatchedText using snippet-based index correction.
67
+ 3) Calculate overall document coverage and print results.
68
+ """
69
+ unverified_chunks: CitationChunks = CitationChunks.from_llm(
70
+ chatterer=chatterer,
71
+ document=document,
72
+ fewshot_examples_generator=fewshot_examples_generator,
73
+ instruction_generator=instruction_generator,
74
+ fewshot_affirmative_response=fewshot_affirmative_response,
75
+ )
76
+
77
+ verified_chunks: list[Citations] = []
78
+ for chunk in unverified_chunks.citation_chunks:
79
+ try:
80
+ vc: Citations = Citations.from_unverified(
81
+ unverified_chunk=chunk,
82
+ document=document,
83
+ model_and_refinement_steps=(chatterer, num_refinement_steps),
84
+ )
85
+ verified_chunks.append(vc)
86
+ except Exception as e:
87
+ logger.error(f"Error processing chunk for subject '{chunk.subject}': {e}")
88
+
89
+ if test_global_coverage:
90
+ gc = GlobalCoverage.from_verified_citations(verified_chunks, document)
91
+ logger.info(f"Global coverage: {gc.coverage * 100:.1f}%")
92
+ if gc.coverage < global_coverage_threshold:
93
+ logger.info(
94
+ f"Global coverage {gc.coverage * 100:.1f}% is below the threshold {global_coverage_threshold * 100:.1f}%."
95
+ )
96
+ print("=== Final Global Coverage Check ===")
97
+ print(f"Overall coverage: {gc.coverage * 100:.1f}% of the document.")
98
+ if gc.matched_intervals:
99
+ print("Merged matched intervals:")
100
+ for interval in gc.matched_intervals:
101
+ print(f" - {interval}")
102
+ else:
103
+ print("No matches found across all chunks.")
104
+ print("\n=== Raw Semantic Chunking Result ===")
105
+ for vc in verified_chunks:
106
+ print(f"{Fore.LIGHTGREEN_EX}[SUBJECT] {Fore.GREEN}{vc.name}{Fore.RESET}")
107
+ if vc.references:
108
+ for source_key, matches in vc.references.items():
109
+ print(f"{Fore.LIGHTBLUE_EX} [SOURCE] {Fore.BLUE}{source_key}{Fore.RESET}")
110
+ for mt in matches:
111
+ snippet = repr(mt.text)
112
+ print(
113
+ f" {Fore.LIGHTYELLOW_EX}[MATCH @ {mt.start_idx}~{mt.end_idx}] {Fore.YELLOW}{snippet}{Fore.RESET}"
114
+ )
115
+ else:
116
+ print(" - (No matches found even after refinement.)")
117
+
118
+ return verified_chunks