chatterer 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,14 +3,12 @@ import importlib
3
3
  import os
4
4
  import re
5
5
  import site
6
- from contextlib import contextmanager, suppress
7
6
  from fnmatch import fnmatch
8
- from io import BufferedReader, BufferedWriter, BytesIO, StringIO, TextIOWrapper
9
7
  from pathlib import Path
10
8
  from typing import (
11
9
  TYPE_CHECKING,
12
10
  Callable,
13
- Iterator,
11
+ Iterable,
14
12
  NamedTuple,
15
13
  NotRequired,
16
14
  Optional,
@@ -20,6 +18,10 @@ from typing import (
20
18
  TypedDict,
21
19
  )
22
20
 
21
+ from ..common_types.io import PathOrReadable
22
+ from ..utils.bytesio import read_bytes_stream
23
+ from .convert_pdf_to_markdown import extract_text_from_pdf
24
+
23
25
  if TYPE_CHECKING:
24
26
  from bs4 import Tag
25
27
  from openai import OpenAI
@@ -38,20 +40,6 @@ type FileTree = dict[str, Optional[FileTree]]
38
40
 
39
41
  # Type aliases for callback functions and file descriptors
40
42
  CodeLanguageCallback: TypeAlias = Callable[["Tag"], Optional[str]]
41
- FileDescriptorOrPath: TypeAlias = int | str | bytes | os.PathLike[str] | os.PathLike[bytes]
42
-
43
- # Type aliases for different types of IO objects
44
- BytesReadable: TypeAlias = BytesIO | BufferedReader
45
- BytesWritable: TypeAlias = BytesIO | BufferedWriter
46
- StringReadable: TypeAlias = StringIO | TextIOWrapper
47
- StringWritable: TypeAlias = StringIO | TextIOWrapper
48
-
49
- # Combined type aliases for readable and writable objects
50
- Readable: TypeAlias = BytesReadable | StringReadable
51
- Writable: TypeAlias = BytesWritable | StringWritable
52
-
53
- # Type alias for path or readable object
54
- PathOrReadable: TypeAlias = FileDescriptorOrPath | Readable
55
43
 
56
44
 
57
45
  class HtmlToMarkdownOptions(TypedDict):
@@ -135,7 +123,13 @@ class CodeSnippets(NamedTuple):
135
123
  base_dir: Path
136
124
 
137
125
  @classmethod
138
- def from_path_or_pkgname(cls, path_or_pkgname: str, ban_file_patterns: Optional[list[str]] = None) -> Self:
126
+ def from_path_or_pkgname(
127
+ cls,
128
+ path_or_pkgname: str,
129
+ glob_patterns: str | list[str] = "*.py",
130
+ case_sensitive: bool = False,
131
+ ban_file_patterns: Optional[list[str]] = None,
132
+ ) -> Self:
139
133
  """
140
134
  Creates a CodeSnippets instance from a file path or package name.
141
135
 
@@ -146,7 +140,12 @@ class CodeSnippets(NamedTuple):
146
140
  Returns:
147
141
  A new CodeSnippets instance with extracted code snippets.
148
142
  """
149
- paths: list[Path] = _get_pyscript_paths(path_or_pkgname=path_or_pkgname, ban_fn_patterns=ban_file_patterns)
143
+ paths: list[Path] = _get_filepaths(
144
+ path_or_pkgname=path_or_pkgname,
145
+ glob_patterns=glob_patterns,
146
+ case_sensitive=case_sensitive,
147
+ ban_fn_patterns=ban_file_patterns,
148
+ )
150
149
  snippets_text: str = "".join(_get_a_snippet(p) for p in paths)
151
150
  return cls(
152
151
  paths=paths,
@@ -223,7 +222,7 @@ def html_to_markdown(html: str, options: Optional[HtmlToMarkdownOptions]) -> str
223
222
  return str(markdownify(html, **(options or {}))) # pyright: ignore[reportUnknownArgumentType]
224
223
 
225
224
 
226
- def pdf_to_text(path_or_file: PathOrReadable) -> str:
225
+ def pdf_to_text(path_or_file: PathOrReadable, page_indices: Iterable[int] | int | None = None) -> str:
227
226
  """
228
227
  Convert a PDF file to plain text.
229
228
 
@@ -231,6 +230,9 @@ def pdf_to_text(path_or_file: PathOrReadable) -> str:
231
230
 
232
231
  Args:
233
232
  path_or_file: Path to a PDF file or a readable object containing PDF data.
233
+ page_indices: Optional page index or iterable of page indices to extract. If None, all pages are extracted.
234
+ If an integer is provided, it extracts that specific page.
235
+ If an iterable is provided, it extracts the specified pages.
234
236
 
235
237
  Returns:
236
238
  str: Extracted text with page markers.
@@ -240,19 +242,14 @@ def pdf_to_text(path_or_file: PathOrReadable) -> str:
240
242
  """
241
243
  from pymupdf import Document # pyright: ignore[reportMissingTypeStubs]
242
244
 
243
- with _open_stream(path_or_file) as stream:
245
+ with read_bytes_stream(path_or_file) as stream:
244
246
  if stream is None:
245
247
  raise FileNotFoundError(path_or_file)
246
- return "\n".join(
247
- f"<!-- Page {page_no} -->\n{text.strip()}\n"
248
- for page_no, text in enumerate(
249
- (
250
- page.get_textpage().extractText() # pyright: ignore[reportUnknownMemberType]
251
- for page in Document(stream=stream.read())
252
- ),
253
- 1,
248
+ with Document(stream=stream.read()) as doc:
249
+ return "\n".join(
250
+ f"<!-- Page {page_no} -->\n{text}\n"
251
+ for page_no, text in extract_text_from_pdf(doc, page_indices).items()
254
252
  )
255
- )
256
253
 
257
254
 
258
255
  def anything_to_markdown(
@@ -400,16 +397,23 @@ def _get_base_dir(target_files: Sequence[Path]) -> Path:
400
397
  return Path(os.path.commonpath(target_files))
401
398
 
402
399
 
403
- def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str]] = None) -> list[Path]:
400
+ def _get_filepaths(
401
+ path_or_pkgname: str,
402
+ glob_patterns: str | list[str] = "*.py",
403
+ case_sensitive: bool = False,
404
+ ban_fn_patterns: Optional[list[str]] = None,
405
+ ) -> list[Path]:
404
406
  """
405
- Gets paths to Python script files from a directory, file, or package name.
407
+ Gets paths to files from a directory, file, or Python package name.
406
408
 
407
- If path_or_pkgname is a directory, finds all .py files recursively.
409
+ If path_or_pkgname is a directory, recursively finds all files matching any of `glob_patterns`.
408
410
  If it's a file, returns just that file.
409
411
  If it's a package name, imports the package and finds all .py files in its directory.
410
412
 
411
413
  Args:
412
414
  path_or_pkgname: Path to directory/file or package name.
415
+ glob_patterns: A glob pattern, or a list of glob patterns, used to match files.
416
+ case_sensitive: Whether to match files case-sensitively.
413
417
  ban_fn_patterns: Optional list of patterns to exclude files.
414
418
 
415
419
  Returns:
@@ -418,7 +422,18 @@ def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str
418
422
  path = Path(path_or_pkgname)
419
423
  pypaths: list[Path]
420
424
  if path.is_dir():
421
- pypaths = list(path.rglob("*.py", case_sensitive=False))
425
+ glob_patterns = glob_patterns if isinstance(glob_patterns, (tuple, list)) else [glob_patterns]
426
+ pypaths = []
427
+ for pattern in glob_patterns:
428
+ if "**" in pattern:
429
+ regex = _pattern_to_regex(pattern)
430
+ pypaths.extend(
431
+ p for p in path.rglob("**/*", case_sensitive=case_sensitive) if regex.match(p.as_posix())
432
+ )
433
+ else:
434
+ pypaths += list(path.rglob(pattern, case_sensitive=case_sensitive))
435
+
436
+ # pypaths = list(path.rglob(glob_pattern, case_sensitive=case_sensitive))
422
437
  elif path.is_file():
423
438
  pypaths = [path]
424
439
  else:
@@ -430,34 +445,3 @@ def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str
430
445
  if p.is_file()
431
446
  ]
432
447
  return [p for p in pypaths if not ban_fn_patterns or not _is_banned(p, ban_fn_patterns)]
433
-
434
-
435
- @contextmanager
436
- def _open_stream(
437
- path_or_file: PathOrReadable,
438
- ) -> Iterator[Optional[BytesReadable]]:
439
- """
440
- Context manager for opening a file or using an existing stream.
441
-
442
- Handles different types of input (file paths, byte streams, string streams)
443
- and yields a BytesReadable object that can be used to read binary data.
444
-
445
- Args:
446
- path_or_file: File path or readable object.
447
-
448
- Yields:
449
- Optional[BytesReadable]: A readable binary stream or None if opening fails.
450
- """
451
- stream: Optional[BytesReadable] = None
452
- try:
453
- with suppress(BaseException):
454
- if isinstance(path_or_file, BytesReadable):
455
- stream = path_or_file
456
- elif isinstance(path_or_file, StringReadable):
457
- stream = BytesIO(path_or_file.read().encode("utf-8"))
458
- else:
459
- stream = open(path_or_file, "rb")
460
- yield stream
461
- finally:
462
- if stream is not None:
463
- stream.close()