chatterer 0.1.26__py3-none-any.whl → 0.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +87 -87
- chatterer/common_types/__init__.py +21 -21
- chatterer/common_types/io.py +19 -19
- chatterer/constants.py +5 -0
- chatterer/examples/__main__.py +75 -75
- chatterer/examples/any2md.py +83 -85
- chatterer/examples/pdf2md.py +231 -338
- chatterer/examples/pdf2txt.py +52 -54
- chatterer/examples/ppt.py +487 -486
- chatterer/examples/pw.py +141 -143
- chatterer/examples/snippet.py +54 -56
- chatterer/examples/transcribe.py +192 -192
- chatterer/examples/upstage.py +87 -89
- chatterer/examples/web2md.py +80 -80
- chatterer/interactive.py +422 -354
- chatterer/language_model.py +530 -536
- chatterer/messages.py +21 -21
- chatterer/tools/__init__.py +46 -46
- chatterer/tools/caption_markdown_images.py +388 -384
- chatterer/tools/citation_chunking/__init__.py +3 -3
- chatterer/tools/citation_chunking/chunks.py +51 -53
- chatterer/tools/citation_chunking/citation_chunker.py +117 -118
- chatterer/tools/citation_chunking/citations.py +284 -285
- chatterer/tools/citation_chunking/prompt.py +157 -157
- chatterer/tools/citation_chunking/reference.py +26 -26
- chatterer/tools/citation_chunking/utils.py +138 -138
- chatterer/tools/convert_pdf_to_markdown.py +636 -645
- chatterer/tools/convert_to_text.py +446 -446
- chatterer/tools/upstage_document_parser.py +704 -705
- chatterer/tools/webpage_to_markdown.py +739 -739
- chatterer/tools/youtube.py +146 -147
- chatterer/utils/__init__.py +15 -15
- chatterer/utils/base64_image.py +349 -350
- chatterer/utils/bytesio.py +59 -59
- chatterer/utils/code_agent.py +237 -237
- chatterer/utils/imghdr.py +145 -145
- {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/METADATA +377 -390
- chatterer-0.1.27.dist-info/RECORD +43 -0
- chatterer-0.1.26.dist-info/RECORD +0 -42
- {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/WHEEL +0 -0
- {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/entry_points.txt +0 -0
- {chatterer-0.1.26.dist-info → chatterer-0.1.27.dist-info}/top_level.txt +0 -0
chatterer/tools/convert_to_text.py
@@ -1,446 +1,446 @@
The hunk rewrites the whole file; the removed and re-added lines are identical in content, so the file is shown once below.

````python
import ast
import importlib
import os
import re
import site
from fnmatch import fnmatch
from pathlib import Path
from typing import (
    TYPE_CHECKING,
    Callable,
    NamedTuple,
    NotRequired,
    Optional,
    Self,
    Sequence,
    TypeAlias,
    TypedDict,
)

from ..common_types.io import PathOrReadable
from ..utils.bytesio import read_bytes_stream
from .convert_pdf_to_markdown import PageIndexType, extract_text_from_pdf

if TYPE_CHECKING:
    from bs4 import Tag
    from openai import OpenAI
    from requests import Response, Session

try:
    from tiktoken import get_encoding, list_encoding_names

    enc = get_encoding(list_encoding_names()[-1])
except ImportError:
    enc = None


# Type definition for representing a file tree structure
type FileTree = dict[str, Optional[FileTree]]

# Type aliases for callback functions and file descriptors
CodeLanguageCallback: TypeAlias = Callable[["Tag"], Optional[str]]


class HtmlToMarkdownOptions(TypedDict):
    """
    TypedDict for options used in HTML to Markdown conversion.

    Contains various configuration options for controlling how HTML is converted to Markdown,
    including formatting preferences, escape behaviors, and styling options.
    """

    autolinks: NotRequired[bool]
    bullets: NotRequired[str]
    code_language: NotRequired[str]
    code_language_callback: NotRequired[CodeLanguageCallback]
    convert: NotRequired[list[str]]
    default_title: NotRequired[bool]
    escape_asterisks: NotRequired[bool]
    escape_underscores: NotRequired[bool]
    escape_misc: NotRequired[bool]
    heading_style: NotRequired[str]
    keep_inline_images_in: NotRequired[list[str]]
    newline_style: NotRequired[str]
    strip: NotRequired[list[str]]
    strip_document: NotRequired[str]
    strong_em_symbol: NotRequired[str]
    sub_symbol: NotRequired[str]
    sup_symbol: NotRequired[str]
    table_infer_header: NotRequired[bool]
    wrap: NotRequired[bool]
    wrap_width: NotRequired[int]


def get_default_html_to_markdown_options() -> HtmlToMarkdownOptions:
    """
    Returns the default options for HTML to Markdown conversion.

    This function provides a set of sensible defaults for the markdownify library,
    including settings for bullets, escaping, heading styles, and other formatting options.

    Returns:
        HtmlToMarkdownOptions: A dictionary of default conversion options.
    """
    from markdownify import (  # pyright: ignore[reportUnknownVariableType, reportMissingTypeStubs]
        ASTERISK,
        SPACES,
        STRIP,
        UNDERLINED,
    )

    return {
        "autolinks": True,
        "bullets": "*+-",  # An iterable of bullet types.
        "code_language": "",
        "default_title": False,
        "escape_asterisks": True,
        "escape_underscores": True,
        "escape_misc": False,
        "heading_style": UNDERLINED,
        "keep_inline_images_in": [],
        "newline_style": SPACES,
        "strip_document": STRIP,
        "strong_em_symbol": ASTERISK,
        "sub_symbol": "",
        "sup_symbol": "",
        "table_infer_header": False,
        "wrap": False,
        "wrap_width": 80,
    }


class CodeSnippets(NamedTuple):
    """
    A named tuple that represents code snippets extracted from Python files.

    Contains the paths to the files, the concatenated text of all snippets,
    and the base directory of the files.
    """

    paths: list[Path]
    snippets_text: str
    base_dir: Path

    @classmethod
    def from_path_or_pkgname(
        cls,
        path_or_pkgname: str,
        glob_patterns: str | list[str] = "*.py",
        case_sensitive: bool = False,
        ban_file_patterns: Optional[list[str]] = None,
    ) -> Self:
        """
        Creates a CodeSnippets instance from a file path or package name.

        Args:
            path_or_pkgname: Path to a file/directory or a Python package name.
            ban_file_patterns: Optional list of patterns to exclude files.

        Returns:
            A new CodeSnippets instance with extracted code snippets.
        """
        paths: list[Path] = _get_filepaths(
            path_or_pkgname=path_or_pkgname,
            glob_patterns=glob_patterns,
            case_sensitive=case_sensitive,
            ban_fn_patterns=ban_file_patterns,
        )
        snippets_text: str = "".join(_get_a_snippet(p) for p in paths)
        return cls(
            paths=paths,
            snippets_text=snippets_text,
            base_dir=_get_base_dir(paths),
        )

    @property
    def metadata(self) -> str:
        """
        Generates metadata about the code snippets.

        Returns a string containing information about the file tree structure,
        total number of files, tokens (if tiktoken is available), and lines.

        Returns:
            str: Formatted metadata string.
        """
        file_paths: list[Path] = self.paths
        text: str = self.snippets_text

        base_dir: Path = _get_base_dir(file_paths)
        results: list[str] = [base_dir.as_posix()]

        file_tree: FileTree = {}
        for file_path in sorted(file_paths):
            rel_path = file_path.relative_to(base_dir)
            subtree: Optional[FileTree] = file_tree
            for part in rel_path.parts[:-1]:
                if subtree is not None:
                    subtree = subtree.setdefault(part, {})
            if subtree is not None:
                subtree[rel_path.parts[-1]] = None

        def _display_tree(tree: FileTree, prefix: str = "") -> None:
            """
            Helper function to recursively display a file tree structure.

            Args:
                tree: The file tree dictionary to display.
                prefix: Current line prefix for proper indentation.
            """
            items: list[tuple[str, Optional[FileTree]]] = sorted(tree.items())
            count: int = len(items)
            for idx, (name, subtree) in enumerate(items):
                branch: str = "└── " if idx == count - 1 else "├── "
                results.append(f"{prefix}{branch}{name}")
                if subtree is not None:
                    extension: str = "    " if idx == count - 1 else "│   "
                    _display_tree(tree=subtree, prefix=prefix + extension)

        _display_tree(file_tree)
        results.append(f"- Total files: {len(file_paths)}")
        if enc is not None:
            num_tokens: int = len(enc.encode(text, disallowed_special=()))
            results.append(f"- Total tokens: {num_tokens}")
        results.append(f"- Total lines: {text.count('\n') + 1}")
        return "\n".join(results)


def html_to_markdown(html: str, options: Optional[HtmlToMarkdownOptions]) -> str:
    """
    Convert HTML content to Markdown using the provided options.

    Args:
        html (str): HTML content to convert.
        options (HtmlToMarkdownOptions): Options for the conversion.

    Returns:
        str: The Markdown content.
    """
    from markdownify import markdownify  # pyright: ignore[reportUnknownVariableType, reportMissingTypeStubs]

    return str(markdownify(html, **(options or {})))  # pyright: ignore[reportUnknownArgumentType]


def pdf_to_text(path_or_file: PathOrReadable, page_indices: Optional[PageIndexType] = None) -> str:
    """
    Convert a PDF file to plain text.

    Extracts text from each page of a PDF file and formats it with page markers.

    Args:
        path_or_file: Path to a PDF file or a readable object containing PDF data.
        page_indices: Optional list of page indices to extract. If None, all pages are extracted.
            If an integer is provided, it extracts that specific page.
            If a list is provided, it extracts the specified pages.

    Returns:
        str: Extracted text with page markers.

    Raises:
        FileNotFoundError: If the file cannot be found or opened.
    """
    from pymupdf import Document  # pyright: ignore[reportMissingTypeStubs]

    with read_bytes_stream(path_or_file) as stream:
        if stream is None:
            raise FileNotFoundError(path_or_file)
        with Document(stream=stream.read()) as doc:
            return "\n".join(
                f"<!-- Page {page_no} -->\n{text}\n"
                for page_no, text in extract_text_from_pdf(doc=doc, page_indices=page_indices).items()
            )


def anything_to_markdown(
    source: "str | Response | Path",
    requests_session: Optional["Session"] = None,
    llm_client: Optional["OpenAI"] = None,
    llm_model: Optional[str] = None,
    style_map: Optional[str] = None,
    exiftool_path: Optional[str] = None,
    docintel_endpoint: Optional[str] = None,
) -> str:
    """
    Convert various types of content to Markdown format.

    Uses the MarkItDown library to convert different types of content (URLs, files, API responses)
    to Markdown format.

    Args:
        source: The source content to convert (URL string, Response object, or Path).
        requests_session: Optional requests Session for HTTP requests.
        llm_client: Optional OpenAI client for LLM-based conversions.
        llm_model: Optional model name for the LLM.
        style_map: Optional style mapping configuration.
        exiftool_path: Optional path to exiftool for metadata extraction.
        docintel_endpoint: Optional Document Intelligence API endpoint.

    Returns:
        str: The converted Markdown content.
    """
    from markitdown import MarkItDown

    result = MarkItDown(
        requests_session=requests_session,
        llm_client=llm_client,
        llm_model=llm_model,
        style_map=style_map,
        exiftool_path=exiftool_path,
        docintel_endpoint=docintel_endpoint,
    ).convert(source)
    return result.text_content


# Alias for CodeSnippets.from_path_or_pkgname for backward compatibility
pyscripts_to_snippets = CodeSnippets.from_path_or_pkgname


def _pattern_to_regex(pattern: str) -> re.Pattern[str]:
    """
    Converts an fnmatch pattern to a regular expression.

    In this function, '**' is converted to match any character including directory separators.
    The remaining '*' matches any character except directory separators, and '?' matches a single character.

    Args:
        pattern: The fnmatch pattern to convert.

    Returns:
        A compiled regular expression pattern.
    """
    # First escape the pattern
    pattern = re.escape(pattern)
    # Convert '**' to match any character including directory separators ('.*')
    pattern = pattern.replace(r"\*\*", ".*")
    # Then convert single '*' to match any character except directory separators
    pattern = pattern.replace(r"\*", "[^/]*")
    # Convert '?' to match a single character
    pattern = pattern.replace(r"\?", ".")
    # Anchor the pattern to start and end
    pattern = "^" + pattern + "$"
    return re.compile(pattern)


def _is_banned(p: Path, ban_patterns: list[str]) -> bool:
    """
    Checks if a given path matches any of the ban patterns.

    Determines if the path p matches any pattern in ban_patterns using either
    fnmatch-based or recursive patterns (i.e., containing '**').

    Note: Patterns should use POSIX-style paths (i.e., '/' separators).

    Args:
        p: The path to check.
        ban_patterns: List of patterns to match against.

    Returns:
        bool: True if the path matches any ban pattern, False otherwise.
    """
    p_str = p.as_posix()
    for pattern in ban_patterns:
        if "**" in pattern:
            regex = _pattern_to_regex(pattern)
            if regex.match(p_str):
                return True
        else:
            # Simple fnmatch: '*' by default doesn't match '/'
            if fnmatch(p_str, pattern):
                return True
    return False


def _get_a_snippet(fpath: Path) -> str:
    """
    Extracts a code snippet from a Python file.

    Reads the file, parses it as Python code, and returns a formatted code snippet
    with the relative path as a header in markdown code block format.

    Args:
        fpath: Path to the Python file.

    Returns:
        str: Formatted code snippet or empty string if the file doesn't exist.
    """
    if not fpath.is_file():
        return ""

    cleaned_code: str = "\n".join(
        line for line in ast.unparse(ast.parse(fpath.read_text(encoding="utf-8"))).splitlines()
    )
    if site_dir := next(
        (d for d in reversed(site.getsitepackages()) if fpath.is_relative_to(d)),
        None,
    ):
        display_path = fpath.relative_to(site_dir)
    elif fpath.is_relative_to(cwd := Path.cwd()):
        display_path = fpath.relative_to(cwd)
    else:
        display_path = fpath.absolute()
    return f"```{display_path}\n{cleaned_code}\n```\n\n"


def _get_base_dir(target_files: Sequence[Path]) -> Path:
    """
    Determines the common base directory for a sequence of file paths.

    Finds the directory with the shortest path that is a parent to at least one file.

    Args:
        target_files: Sequence of file paths.

    Returns:
        Path: The common base directory.
    """
    return Path(os.path.commonpath(target_files))


def _get_filepaths(
    path_or_pkgname: str,
    glob_patterns: str | list[str] = "*.py",
    case_sensitive: bool = False,
    ban_fn_patterns: Optional[list[str]] = None,
) -> list[Path]:
    """
    Gets paths to files from a directory, file, or Python package name.

    If path_or_pkgname is a directory, finds all `glob_patterns` matching files recursively.
    If it's a file, returns just that file.
    If it's a package name, imports the package and finds all .py files in its directory.

    Args:
        path_or_pkgname: Path to directory/file or package name.
        glob_patterns: Pattern(s) to match files.
        case_sensitive: Whether to match files case-sensitively.
        ban_fn_patterns: Optional list of patterns to exclude files.

    Returns:
        list[Path]: List of paths to Python files.
    """
    path = Path(path_or_pkgname)
    pypaths: list[Path]
    if path.is_dir():
        glob_patterns = glob_patterns if isinstance(glob_patterns, (tuple, list)) else [glob_patterns]
        pypaths = []
        for pattern in glob_patterns:
            if "**" in pattern:
                regex = _pattern_to_regex(pattern)
                pypaths.extend(
                    p for p in path.rglob("**/*", case_sensitive=case_sensitive) if regex.match(p.as_posix())
                )
            else:
                pypaths += list(path.rglob(pattern, case_sensitive=case_sensitive))

        # pypaths = list(path.rglob(glob_pattern, case_sensitive=case_sensitive))
    elif path.is_file():
        pypaths = [path]
    else:
        pypaths = [
            p
            for p in Path(next(iter(importlib.import_module(path_or_pkgname).__path__))).rglob(
                "*.py", case_sensitive=False
            )
            if p.is_file()
        ]
    return [p for p in pypaths if not ban_fn_patterns or not _is_banned(p, ban_fn_patterns)]
````
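For orientation, here is a minimal usage sketch of the public helpers defined in this file. It assumes chatterer 0.1.27 is installed along with the optional dependencies these functions import lazily (markdownify, pymupdf, markitdown); the input path, URL, and ban pattern are placeholders, not part of the package.

```python
from chatterer.tools.convert_to_text import (
    CodeSnippets,
    anything_to_markdown,
    get_default_html_to_markdown_options,
    html_to_markdown,
    pdf_to_text,
)

# HTML -> Markdown using the library's default markdownify options.
md = html_to_markdown(
    "<h1>Title</h1><p>Hello <b>world</b></p>",
    get_default_html_to_markdown_options(),
)

# PDF -> plain text with "<!-- Page N -->" markers (placeholder path).
text = pdf_to_text("example.pdf", page_indices=None)

# Any URL, file, or Response -> Markdown via MarkItDown (placeholder URL).
page_md = anything_to_markdown("https://example.com")

# Collect Python sources from an installed package, excluding examples.
snippets = CodeSnippets.from_path_or_pkgname(
    "chatterer", ban_file_patterns=["**/examples/**"]
)
print(snippets.metadata)       # file tree plus file/token/line counts
print(snippets.snippets_text)  # concatenated, fenced code snippets
```

Note that each converter imports its heavy dependency inside the function body, so importing the module itself stays cheap when only one of the backends is installed.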