chatterer 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,157 @@
+"""
+ragent/prompt/citation_chunking.py
+
+This module defines prompt constants for citation chunking.
+The LLM is expected to return JSON objects that include only the text snippets for the beginning and end of the citation span.
+The character indices will be computed in a post-processing step.
+"""
+
+from functools import cache
+
+
+@cache
+def generate_instruction() -> str:
+    from .chunks import CitationChunk, CitationChunks
+    from .reference import (
+        MultiMatchRegex,
+        SingleMatchCitation,
+    )
+
+    return (
+        "You are an AI specialized in 'citation-based text chunking'.\n"
+        "Given a document, perform the following steps:\n"
+        "1) Identify the major topics in the document.\n"
+        "2) For each topic, provide a list of citation objects indicating the text snippets at the beginning and end of the relevant paragraph(s) for that topic.\n\n"
+        "Important:\n"
+        "- Return citation objects with 'start_text' and 'end_text' fields to precisely capture the text span. Do NOT include character indices.\n"
+        "- If a regular expression based matching is more appropriate for a topic (e.g. for multiple matches), you may include a regex object of type 'multi_match_regex'.\n\n"
+        "Return JSON strictly in the following format:\n"
+        "{json_example}\n\n"
+        "1) Return only valid JSON (no extra keys).\n"
+        "2) Do NOT include any commentary.\n"
+        "3) Ensure that the citations capture the entire relevant paragraph without overlap or omission."
+    ).format(
+        json_example=CitationChunks(
+            citation_chunks=[
+                CitationChunk(
+                    subject="Quantum Advantage",
+                    references=[
+                        SingleMatchCitation(
+                            start_from="Starting snippet...",
+                            end_at="... Ending snippet",
+                        ),
+                        MultiMatchRegex(
+                            type="multi_match_regex",
+                            regular_expression="Some.*?regex.*?pattern",
+                        ),
+                    ],
+                ),
+            ]
+        ).model_dump_json(indent=2)
+    )
+
+
+@cache
+def generate_human_assistant_fewshot_examples() -> list[tuple[str, str]]:
+    from .chunks import CitationChunk, CitationChunks
+    from .reference import SingleMatchCitation
+
+    return [
+        (
+            "Agent-Semantic Chunking of the following text:\n\n"
+            "Title: Revolutionary Breakthrough in Quantum Computing\n\n"
+            "In a landmark development, researchers at the National Quantum Laboratory unveiled a quantum computer "
+            "that demonstrates clear quantum advantage by performing computations that are infeasible on classical systems.\n\n"
+            "The breakthrough is the result of years of rigorous research and international collaboration. "
+            "The system leverages entanglement and superposition to process complex algorithms at unprecedented speeds.\n\n"
+            "However, practical applications are still emerging, and experts caution about scalability challenges. "
+            "Meanwhile, several tech giants are expressing keen interest in integrating quantum technology into future products.\n\n"
+            "Please classify the major topics and return the exact text snippets (for the start and end of the relevant paragraphs) for each topic.",
+            CitationChunks(
+                citation_chunks=[
+                    CitationChunk(
+                        subject="Quantum Advantage",
+                        references=[
+                            SingleMatchCitation(
+                                start_from="In a landmark development",
+                                end_at="on classical systems.",
+                            ),
+                        ],
+                    ),
+                    CitationChunk(
+                        subject="Research Collaboration",
+                        references=[
+                            SingleMatchCitation(
+                                start_from="The breakthrough is the result",
+                                end_at="unprecedented speeds.",
+                            ),
+                        ],
+                    ),
+                    CitationChunk(
+                        subject="Practical Challenges",
+                        references=[
+                            SingleMatchCitation(
+                                start_from="However, practical applications",
+                                end_at="scalability challenges.",
+                            ),
+                        ],
+                    ),
+                    CitationChunk(
+                        subject="Industry Interest",
+                        references=[
+                            SingleMatchCitation(
+                                start_from="Meanwhile, several tech giants",
+                                end_at="future products.",
+                            ),
+                        ],
+                    ),
+                ]
+            ).model_dump_json(indent=2),
+        ),
+        (
+            "Agent-Semantic Chunking of the following text:\n\n"
+            "Title: Rising Seas and Coastal Erosion: A Global Crisis\n\n"
+            "Communities worldwide face the impacts of climate change as rising sea levels lead to accelerated coastal erosion, "
+            "jeopardizing homes and critical infrastructure.\n\n"
+            'In a small coastal town, residents noted that "the encroaching sea" has already begun to claim beachfront properties, '
+            "prompting local authorities to implement emergency measures.\n\n"
+            "Environmental experts warn that without significant intervention, the frequency and severity of these events will increase, "
+            "further exacerbating the global climate crisis.\n\n"
+            "Please classify the major topics and return the exact text snippets (for the start and end of the relevant paragraphs) for each topic.",
+            CitationChunks(
+                citation_chunks=[
+                    CitationChunk(
+                        subject="Coastal Erosion Impact",
+                        references=[
+                            SingleMatchCitation(
+                                start_from="Communities worldwide face the impacts",
+                                end_at="critical infrastructure.",
+                            ),
+                        ],
+                    ),
+                    CitationChunk(
+                        subject="Local Emergency Response",
+                        references=[
+                            SingleMatchCitation(
+                                start_from="In a small coastal town",
+                                end_at="emergency measures.",
+                            ),
+                        ],
+                    ),
+                    CitationChunk(
+                        subject="Expert Warning",
+                        references=[
+                            SingleMatchCitation(
+                                start_from="Environmental experts warn",
+                                end_at="global climate crisis.",
+                            ),
+                        ],
+                    ),
+                ]
+            ).model_dump_json(indent=2),
+        ),
+    ]
+
+
+def generate_fewshot_affirmative_response() -> str:
+    return "Great! I will now perform the citation-based chunking. Please provide the document to process!"
@@ -0,0 +1,26 @@
+from typing import Literal, TypeAlias
+
+from pydantic import BaseModel, Field
+
+
+class MultiMatchRegex(BaseModel):
+    type: Literal["multi_match_regex"] = Field(
+        description="A regex pattern that should match multiple instances of the subject in the document."
+    )
+    regular_expression: str = Field(
+        description="The regex pattern that should match multiple instances of the subject in the document."
+    )
+
+    def __hash__(self) -> int:
+        return hash((self.type, self.regular_expression))
+
+
+class SingleMatchCitation(BaseModel):
+    start_from: str = Field(description="A snippet of text at the beginning of the cited section.")
+    end_at: str = Field(description="A snippet of text at the end of the cited section.")
+
+    def __hash__(self) -> int:
+        return hash((self.start_from, self.end_at))
+
+
+Reference: TypeAlias = SingleMatchCitation | MultiMatchRegex
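
Note (not part of the diff): a Reference is either a text-span citation or a multi-match regex. A short sketch of how the two variants might be constructed and told apart downstream; the snippet text and regex are made up for illustration:

citation = SingleMatchCitation(
    start_from="In a landmark development",
    end_at="on classical systems.",
)
regex_ref = MultiMatchRegex(type="multi_match_regex", regular_expression=r"Figure \d+")

references: list[Reference] = [citation, regex_ref]
for ref in references:
    if isinstance(ref, SingleMatchCitation):
        print("span:", ref.start_from, "->", ref.end_at)
    else:
        print("regex:", ref.regular_expression)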
@@ -0,0 +1,138 @@
+from typing import Callable, NamedTuple, Self, TypeVar
+
+from pydantic import BaseModel
+
+T = TypeVar("T", bound=BaseModel)
+
+
+class MatchedText(NamedTuple):
+    text: str
+    start_idx: int
+    end_idx: int
+
+    @classmethod
+    def from_text(
+        cls,
+        full_text: str,
+        len_func: Callable[[str], int],
+        chunk_size: int = 2048,
+        token_overlap: int = 0,
+        separator: str = "\n",
+    ) -> list[Self]:
+        """
+        Split text into chunks based on a token-count limit and optional overlap.
+        Each chunk is returned together with its position (start_idx, end_idx) in the original text.
+        The text is split on the separator string, and token counts are computed with len_func.
+
+        Args:
+            full_text: The full text to split.
+            len_func: A function that returns the token count of a given text.
+            chunk_size: Maximum number of tokens per chunk. Defaults to 2048.
+            token_overlap: Number of tokens to overlap between chunks. Defaults to 0.
+            separator: Separator string used to split the text. Defaults to "\n".
+
+        Returns:
+            A list of tuples of the form (chunk_text, start_idx, end_idx), where
+            chunk_text is the substring of full_text equal to full_text[start_idx:end_idx].
+        """
+        text_chunks: list[Self] = []
+        sep_token_count: int = len_func(separator)
+        sep_len = len(separator)
+
+        # First, split the original text on the separator while recording each piece's start/end indices.
+        piece_infos: list[Self] = []  # each entry: (piece_text, start_index, end_index)
+        start_idx = 0
+        while True:
+            idx = full_text.find(separator, start_idx)
+            if idx == -1:
+                # Last piece: no more separators, so append the entire remainder.
+                piece_infos.append(
+                    cls(
+                        text=full_text[start_idx:],
+                        start_idx=start_idx,
+                        end_idx=len(full_text),
+                    )
+                )
+                break
+            else:
+                piece_infos.append(
+                    cls(
+                        text=full_text[start_idx:idx],
+                        start_idx=start_idx,
+                        end_idx=idx,
+                    )
+                )
+                start_idx = idx + sep_len
+
+        current_chunk: list[Self] = []
+        current_token_count: int = 0
+        i = 0
+        while i < len(piece_infos):
+            piece_info = piece_infos[i]
+            piece = piece_info.text
+            piece_start = piece_info.start_idx
+            piece_end = piece_info.end_idx
+            # As in the original code, each piece's token count also includes the separator's tokens.
+            piece_token_count: int = len_func(piece) + sep_token_count
+
+            # Adding this piece to the current chunk would exceed chunk_size
+            if current_token_count + piece_token_count > chunk_size:
+                # If a single piece is larger than chunk_size, there is no choice but to add it anyway.
+                if not current_chunk:
+                    current_chunk.append(
+                        cls(
+                            text=piece,
+                            start_idx=piece_start,
+                            end_idx=piece_end,
+                        )
+                    )
+                    current_token_count += piece_token_count
+                    i += 1
+                # The current chunk is complete -> append it to the result
+                chunk_start = current_chunk[0].start_idx
+                # The pieces in current_chunk are contiguous in the original text,
+                # so the chunk's end index is the last piece's end_index.
+                chunk_end = current_chunk[-1].end_idx
+                # Slicing that span out of the original text keeps the separators.
+                chunk_text = full_text[chunk_start:chunk_end]
+                text_chunks.append(
+                    cls(
+                        text=chunk_text,
+                        start_idx=chunk_start,
+                        end_idx=chunk_end,
+                    )
+                )
+
+                # If token_overlap is enabled: carry part of the end of this chunk over into the next chunk.
+                if token_overlap > 0:
+                    overlap_chunk: list[Self] = []
+                    overlap_count: int = 0
+                    # Select the pieces to overlap, walking backwards from the end.
+                    for j in range(len(current_chunk) - 1, -1, -1):
+                        p_text = current_chunk[j].text
+                        p_token_count = len_func(p_text) + sep_token_count
+                        # Include at least one piece, and keep adding while the overlap stays within token_overlap.
+                        if overlap_count + p_token_count <= token_overlap or not overlap_chunk:
+                            overlap_chunk.insert(0, current_chunk[j])
+                            overlap_count += p_token_count
+                        else:
+                            break
+                    current_chunk = overlap_chunk.copy()
+                    current_token_count = overlap_count
+                else:
+                    current_chunk.clear()
+                    current_token_count = 0
+            else:
+                # Add the piece to the chunk and move on to the next piece.
+                current_chunk.append(cls(text=piece, start_idx=piece_start, end_idx=piece_end))
+                current_token_count += piece_token_count
+                i += 1
+
+        # If any pieces remain, add them as the final chunk.
+        if current_chunk:
+            chunk_start = current_chunk[0].start_idx
+            chunk_end = current_chunk[-1].end_idx
+            chunk_text = full_text[chunk_start:chunk_end]
+            text_chunks.append(cls(text=chunk_text, start_idx=chunk_start, end_idx=chunk_end))
+
+        return text_chunks
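
Note (not part of the diff): a usage sketch for MatchedText.from_text, assuming a whitespace word count stands in for a real tokenizer; any callable with the same signature (for example a tiktoken-based counter) would work:

def word_count(text: str) -> int:
    return len(text.split())

document = "\n".join(f"Paragraph {i}: " + "word " * 30 for i in range(10))
chunks = MatchedText.from_text(document, len_func=word_count, chunk_size=50, token_overlap=10)
for chunk in chunks:
    # Each chunk records where it came from, so it can be sliced back out of the original text.
    assert document[chunk.start_idx : chunk.end_idx] == chunk.text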
@@ -57,10 +57,11 @@ PathOrReadable: TypeAlias = FileDescriptorOrPath | Readable
 class HtmlToMarkdownOptions(TypedDict):
     """
     TypedDict for options used in HTML to Markdown conversion.
-
+
     Contains various configuration options for controlling how HTML is converted to Markdown,
     including formatting preferences, escape behaviors, and styling options.
     """
+
     autolinks: NotRequired[bool]
     bullets: NotRequired[str]
     code_language: NotRequired[str]
@@ -86,10 +87,10 @@ class HtmlToMarkdownOptions(TypedDict):
 def get_default_html_to_markdown_options() -> HtmlToMarkdownOptions:
     """
     Returns the default options for HTML to Markdown conversion.
-
+
     This function provides a set of sensible defaults for the markdownify library,
     including settings for bullets, escaping, heading styles, and other formatting options.
-
+
     Returns:
         HtmlToMarkdownOptions: A dictionary of default conversion options.
     """
@@ -124,10 +125,11 @@ def get_default_html_to_markdown_options() -> HtmlToMarkdownOptions:
 class CodeSnippets(NamedTuple):
     """
     A named tuple that represents code snippets extracted from Python files.
-
+
     Contains the paths to the files, the concatenated text of all snippets,
     and the base directory of the files.
     """
+
     paths: list[Path]
     snippets_text: str
     base_dir: Path
@@ -136,11 +138,11 @@ class CodeSnippets(NamedTuple):
     def from_path_or_pkgname(cls, path_or_pkgname: str, ban_file_patterns: Optional[list[str]] = None) -> Self:
         """
         Creates a CodeSnippets instance from a file path or package name.
-
+
         Args:
             path_or_pkgname: Path to a file/directory or a Python package name.
             ban_file_patterns: Optional list of patterns to exclude files.
-
+
         Returns:
             A new CodeSnippets instance with extracted code snippets.
         """
@@ -156,10 +158,10 @@ class CodeSnippets(NamedTuple):
     def metadata(self) -> str:
         """
         Generates metadata about the code snippets.
-
+
         Returns a string containing information about the file tree structure,
         total number of files, tokens (if tiktoken is available), and lines.
-
+
         Returns:
             str: Formatted metadata string.
         """
@@ -182,7 +184,7 @@ class CodeSnippets(NamedTuple):
         def _display_tree(tree: FileTree, prefix: str = "") -> None:
             """
             Helper function to recursively display a file tree structure.
-
+
             Args:
                 tree: The file tree dictionary to display.
                 prefix: Current line prefix for proper indentation.
@@ -224,15 +226,15 @@ def html_to_markdown(html: str, options: Optional[HtmlToMarkdownOptions]) -> str
 def pdf_to_text(path_or_file: PathOrReadable) -> str:
     """
     Convert a PDF file to plain text.
-
+
     Extracts text from each page of a PDF file and formats it with page markers.
-
+
     Args:
         path_or_file: Path to a PDF file or a readable object containing PDF data.
-
+
     Returns:
         str: Extracted text with page markers.
-
+
     Raises:
         FileNotFoundError: If the file cannot be found or opened.
     """
@@ -264,10 +266,10 @@ def anything_to_markdown(
 ) -> str:
     """
     Convert various types of content to Markdown format.
-
+
     Uses the MarkItDown library to convert different types of content (URLs, files, API responses)
     to Markdown format.
-
+
     Args:
         source: The source content to convert (URL string, Response object, or Path).
         requests_session: Optional requests Session for HTTP requests.
@@ -276,7 +278,7 @@ def anything_to_markdown(
         style_map: Optional style mapping configuration.
         exiftool_path: Optional path to exiftool for metadata extraction.
         docintel_endpoint: Optional Document Intelligence API endpoint.
-
+
     Returns:
         str: The converted Markdown content.
     """
@@ -300,13 +302,13 @@ pyscripts_to_snippets = CodeSnippets.from_path_or_pkgname
 def _pattern_to_regex(pattern: str) -> re.Pattern[str]:
     """
     Converts an fnmatch pattern to a regular expression.
-
+
     In this function, '**' is converted to match any character including directory separators.
     The remaining '*' matches any character except directory separators, and '?' matches a single character.
-
+
     Args:
         pattern: The fnmatch pattern to convert.
-
+
     Returns:
         A compiled regular expression pattern.
     """
@@ -326,16 +328,16 @@ def _pattern_to_regex(pattern: str) -> re.Pattern[str]:
 def _is_banned(p: Path, ban_patterns: list[str]) -> bool:
     """
     Checks if a given path matches any of the ban patterns.
-
-    Determines if the path p matches any pattern in ban_patterns using either
+
+    Determines if the path p matches any pattern in ban_patterns using either
     fnmatch-based or recursive patterns (i.e., containing '**').
-
+
     Note: Patterns should use POSIX-style paths (i.e., '/' separators).
-
+
     Args:
         p: The path to check.
         ban_patterns: List of patterns to match against.
-
+
     Returns:
         bool: True if the path matches any ban pattern, False otherwise.
     """
@@ -355,13 +357,13 @@ def _is_banned(p: Path, ban_patterns: list[str]) -> bool:
 def _get_a_snippet(fpath: Path) -> str:
     """
     Extracts a code snippet from a Python file.
-
+
     Reads the file, parses it as Python code, and returns a formatted code snippet
     with the relative path as a header in markdown code block format.
-
+
     Args:
         fpath: Path to the Python file.
-
+
     Returns:
         str: Formatted code snippet or empty string if the file doesn't exist.
     """
@@ -386,33 +388,30 @@ def _get_a_snippet(fpath: Path) -> str:
 def _get_base_dir(target_files: Sequence[Path]) -> Path:
     """
     Determines the common base directory for a sequence of file paths.
-
+
     Finds the directory with the shortest path that is a parent to at least one file.
-
+
     Args:
         target_files: Sequence of file paths.
-
+
     Returns:
         Path: The common base directory.
     """
-    return sorted(
-        {file_path.parent for file_path in target_files},
-        key=lambda p: len(p.parts),
-    )[0]
+    return Path(os.path.commonpath(target_files))
 
 
 def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str]] = None) -> list[Path]:
     """
     Gets paths to Python script files from a directory, file, or package name.
-
+
     If path_or_pkgname is a directory, finds all .py files recursively.
     If it's a file, returns just that file.
     If it's a package name, imports the package and finds all .py files in its directory.
-
+
     Args:
         path_or_pkgname: Path to directory/file or package name.
         ban_fn_patterns: Optional list of patterns to exclude files.
-
+
     Returns:
         list[Path]: List of paths to Python files.
     """
@@ -430,7 +429,7 @@ def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str
         )
         if p.is_file()
     ]
-    return [p for p in pypaths if ban_fn_patterns and not _is_banned(p, ban_fn_patterns)]
+    return [p for p in pypaths if not ban_fn_patterns or not _is_banned(p, ban_fn_patterns)]
 
 
 @contextmanager
@@ -439,13 +438,13 @@ def _open_stream(
 ) -> Iterator[Optional[BytesReadable]]:
     """
     Context manager for opening a file or using an existing stream.
-
+
     Handles different types of input (file paths, byte streams, string streams)
     and yields a BytesReadable object that can be used to read binary data.
-
+
     Args:
         path_or_file: File path or readable object.
-
+
     Yields:
         Optional[BytesReadable]: A readable binary stream or None if opening fails.
     """
@@ -30,6 +30,7 @@ import playwright.async_api
 import playwright.sync_api
 
 from ...language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
+from ...utils.image import Base64Image, get_default_image_processing_config
 from ..convert_to_text import HtmlToMarkdownOptions, get_default_html_to_markdown_options, html_to_markdown
 from .utils import (
     DEFAULT_UA,
@@ -41,7 +42,6 @@ from .utils import (
     SelectedLineRanges,
     WaitUntil,
     aget_image_url_and_markdown_links,
-    get_default_image_processing_config,
     get_default_playwright_launch_options,
     get_image_url_and_markdown_links,
     replace_images,
@@ -392,6 +392,8 @@ Markdown-formatted webpage content is provided below for your reference:
         timeout: Union[float, int] = 8,
         keep_page: bool = False,
         referer: Optional[str] = None,
+        describe_images: bool = True,
+        filter: bool = True,
     ) -> str:
         """
         Convert a URL's page to Markdown and use a language model (Chatterer) to filter out unimportant lines.
@@ -409,6 +411,8 @@ Markdown-formatted webpage content is provided below for your reference:
             timeout (float | int): Navigation timeout (in seconds).
             keep_page (bool): If True, do not close the page after processing.
             referer (Optional[str]): Referer URL to set.
+            describe_images (bool): If True, describe images in the Markdown text.
+            filter (bool): If True, filter the important lines using the language model.
 
         Returns:
             str: Filtered Markdown containing only the important lines.
@@ -423,7 +427,10 @@ Markdown-formatted webpage content is provided below for your reference:
             keep_page=keep_page,
             referer=referer,
         )
-        markdown_content = self.describe_images(markdown_text=markdown_content, referer_url=url)
+        if describe_images:
+            markdown_content = self.describe_images(markdown_text=markdown_content, referer_url=url)
+        if not filter:
+            return markdown_content
         lines = markdown_content.split("\n")
         line_length = len(lines)
         important_lines: set[int] = set()
@@ -465,6 +472,8 @@ Markdown-formatted webpage content is provided below for your reference:
         timeout: Union[float, int] = 8,
         keep_page: bool = False,
         referer: Optional[str] = None,
+        describe_images: bool = True,
+        filter: bool = True,
     ) -> str:
         """
         Asynchronously convert a URL's page to Markdown and use the language model (Chatterer)
@@ -483,6 +492,8 @@ Markdown-formatted webpage content is provided below for your reference:
             timeout (float | int): Navigation timeout (in seconds).
             keep_page (bool): If True, do not close the page after processing.
             referer (Optional[str]): Referer URL to set.
+            describe_images (bool): If True, describe images in the Markdown text.
+            filter (bool): If True, filter the important lines using the language model.
 
         Returns:
             str: Filtered Markdown containing only the important lines.
@@ -497,7 +508,10 @@ Markdown-formatted webpage content is provided below for your reference:
             keep_page=keep_page,
             referer=referer,
         )
-        markdown_content = await self.adescribe_images(markdown_text=markdown_content, referer_url=url)
+        if describe_images:
+            markdown_content = await self.adescribe_images(markdown_text=markdown_content, referer_url=url)
+        if not filter:
+            return markdown_content
         lines = markdown_content.split("\n")
         line_length = len(lines)
         important_lines: set[int] = set()
@@ -529,10 +543,12 @@ Markdown-formatted webpage content is provided below for your reference:
         """
         Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
         """
-        image_url_and_markdown_links: dict[Optional[str], list[MarkdownLink]] = get_image_url_and_markdown_links(
-            markdown_text=markdown_text,
-            headers=self.headers | {"Referer": referer_url},
-            config=self.image_processing_config,
+        image_url_and_markdown_links: dict[Optional[Base64Image], list[MarkdownLink]] = (
+            get_image_url_and_markdown_links(
+                markdown_text=markdown_text,
+                headers=self.headers | {"Referer": referer_url},
+                config=self.image_processing_config,
+            )
         )
 
         image_description_and_references: ImageDescriptionAndReferences = ImageDescriptionAndReferences({})
@@ -540,7 +556,7 @@ Markdown-formatted webpage content is provided below for your reference:
             if image_url is not None:
                 try:
                     image_summary: str = self.chatterer.describe_image(
-                        image_url=image_url,
+                        image_url=image_url.data_uri,
                         instruction=self.image_description_instruction,
                     )
                 except Exception:
@@ -560,7 +576,9 @@ Markdown-formatted webpage content is provided below for your reference:
         """
         Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
         """
-        image_url_and_markdown_links: dict[Optional[str], list[MarkdownLink]] = await aget_image_url_and_markdown_links(
+        image_url_and_markdown_links: dict[
+            Optional[Base64Image], list[MarkdownLink]
+        ] = await aget_image_url_and_markdown_links(
             markdown_text=markdown_text,
             headers=self.headers | {"Referer": referer_url},
             config=self.image_processing_config,
@@ -576,7 +594,7 @@ Markdown-formatted webpage content is provided below for your reference:
                 return True
 
         coros: list[Awaitable[Optional[str]]] = [
-            self.chatterer.adescribe_image(image_url=image_url, instruction=self.image_description_instruction)
+            self.chatterer.adescribe_image(image_url=image_url.data_uri, instruction=self.image_description_instruction)
             if image_url is not None
             else dummy()
             for image_url in image_url_and_markdown_links.keys()
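
Note (not part of the diff): the hunks above add describe_images and filter switches to both the synchronous and asynchronous URL-to-Markdown paths, and image descriptions are now generated from Base64Image.data_uri values rather than raw URL strings. A hedged call-site sketch; "bot" stands for an instance of the browser class these methods belong to, and the method name url_to_md_with_llm is an assumption inferred from the docstrings rather than shown in this diff:

def fetch_page_markdown(bot, url: str, describe_images: bool = True, filter: bool = True) -> str:
    # Assumed method name; only the two new keyword arguments are introduced by this diff.
    return bot.url_to_md_with_llm(
        url,
        describe_images=describe_images,  # False skips the image-description pass
        filter=filter,  # False returns the raw Markdown without LLM line filtering
    )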