chatterer 0.1.13__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,7 @@ from pathlib import Path
8
8
  from typing import (
9
9
  TYPE_CHECKING,
10
10
  Callable,
11
+ Iterable,
11
12
  NamedTuple,
12
13
  NotRequired,
13
14
  Optional,
@@ -19,6 +20,7 @@ from typing import (
19
20
 
20
21
  from ..common_types.io import PathOrReadable
21
22
  from ..utils.bytesio import read_bytes_stream
23
+ from .convert_pdf_to_markdown import extract_text_from_pdf
22
24
 
23
25
  if TYPE_CHECKING:
24
26
  from bs4 import Tag
@@ -121,7 +123,13 @@ class CodeSnippets(NamedTuple):
121
123
  base_dir: Path
122
124
 
123
125
  @classmethod
124
- def from_path_or_pkgname(cls, path_or_pkgname: str, ban_file_patterns: Optional[list[str]] = None) -> Self:
126
+ def from_path_or_pkgname(
127
+ cls,
128
+ path_or_pkgname: str,
129
+ glob_patterns: str | list[str] = "*.py",
130
+ case_sensitive: bool = False,
131
+ ban_file_patterns: Optional[list[str]] = None,
132
+ ) -> Self:
125
133
  """
126
134
  Creates a CodeSnippets instance from a file path or package name.
127
135
 
@@ -132,7 +140,12 @@ class CodeSnippets(NamedTuple):
132
140
  Returns:
133
141
  A new CodeSnippets instance with extracted code snippets.
134
142
  """
135
- paths: list[Path] = _get_pyscript_paths(path_or_pkgname=path_or_pkgname, ban_fn_patterns=ban_file_patterns)
143
+ paths: list[Path] = _get_filepaths(
144
+ path_or_pkgname=path_or_pkgname,
145
+ glob_patterns=glob_patterns,
146
+ case_sensitive=case_sensitive,
147
+ ban_fn_patterns=ban_file_patterns,
148
+ )
136
149
  snippets_text: str = "".join(_get_a_snippet(p) for p in paths)
137
150
  return cls(
138
151
  paths=paths,
@@ -209,7 +222,7 @@ def html_to_markdown(html: str, options: Optional[HtmlToMarkdownOptions]) -> str
209
222
  return str(markdownify(html, **(options or {}))) # pyright: ignore[reportUnknownArgumentType]
210
223
 
211
224
 
212
- def pdf_to_text(path_or_file: PathOrReadable) -> str:
225
+ def pdf_to_text(path_or_file: PathOrReadable, page_indices: Iterable[int] | int | None = None) -> str:
213
226
  """
214
227
  Convert a PDF file to plain text.
215
228
 
@@ -217,6 +230,9 @@ def pdf_to_text(path_or_file: PathOrReadable) -> str:
217
230
 
218
231
  Args:
219
232
  path_or_file: Path to a PDF file or a readable object containing PDF data.
233
+ page_indices: Optional list of page indices to extract. If None, all pages are extracted.
234
+ If an integer is provided, it extracts that specific page.
235
+ If a list is provided, it extracts the specified pages.
220
236
 
221
237
  Returns:
222
238
  str: Extracted text with page markers.
@@ -229,16 +245,11 @@ def pdf_to_text(path_or_file: PathOrReadable) -> str:
229
245
  with read_bytes_stream(path_or_file) as stream:
230
246
  if stream is None:
231
247
  raise FileNotFoundError(path_or_file)
232
- return "\n".join(
233
- f"<!-- Page {page_no} -->\n{text.strip()}\n"
234
- for page_no, text in enumerate(
235
- (
236
- page.get_textpage().extractText() # pyright: ignore[reportUnknownMemberType]
237
- for page in Document(stream=stream.read())
238
- ),
239
- 1,
248
+ with Document(stream=stream.read()) as doc:
249
+ return "\n".join(
250
+ f"<!-- Page {page_no} -->\n{text}\n"
251
+ for page_no, text in extract_text_from_pdf(doc, page_indices).items()
240
252
  )
241
- )
242
253
 
243
254
 
244
255
  def anything_to_markdown(
@@ -386,16 +397,23 @@ def _get_base_dir(target_files: Sequence[Path]) -> Path:
386
397
  return Path(os.path.commonpath(target_files))
387
398
 
388
399
 
389
- def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str]] = None) -> list[Path]:
400
+ def _get_filepaths(
401
+ path_or_pkgname: str,
402
+ glob_patterns: str | list[str] = "*.py",
403
+ case_sensitive: bool = False,
404
+ ban_fn_patterns: Optional[list[str]] = None,
405
+ ) -> list[Path]:
390
406
  """
391
- Gets paths to Python script files from a directory, file, or package name.
407
+ Gets paths to files from a directory, file, or Python package name.
392
408
 
393
- If path_or_pkgname is a directory, finds all .py files recursively.
409
+ If path_or_pkgname is a directory, recursively finds all files matching `glob_patterns`.
394
410
  If it's a file, returns just that file.
395
411
  If it's a package name, imports the package and finds all .py files in its directory.
396
412
 
397
413
  Args:
398
414
  path_or_pkgname: Path to directory/file or package name.
415
+ glob_patterns: Glob pattern, or list of glob patterns, to match files.
416
+ case_sensitive: Whether to match files case-sensitively.
399
417
  ban_fn_patterns: Optional list of patterns to exclude files.
400
418
 
401
419
  Returns:
@@ -404,7 +422,18 @@ def _get_pyscript_paths(path_or_pkgname: str, ban_fn_patterns: Optional[list[str
404
422
  path = Path(path_or_pkgname)
405
423
  pypaths: list[Path]
406
424
  if path.is_dir():
407
- pypaths = list(path.rglob("*.py", case_sensitive=False))
425
+ glob_patterns = glob_patterns if isinstance(glob_patterns, (tuple, list)) else [glob_patterns]
426
+ pypaths = []
427
+ for pattern in glob_patterns:
428
+ if "**" in pattern:
429
+ regex = _pattern_to_regex(pattern)
430
+ pypaths.extend(
431
+ p for p in path.rglob("**/*", case_sensitive=case_sensitive) if regex.match(p.as_posix())
432
+ )
433
+ else:
434
+ pypaths += list(path.rglob(pattern, case_sensitive=case_sensitive))
435
+
436
+ # pypaths = list(path.rglob(glob_pattern, case_sensitive=case_sensitive))
408
437
  elif path.is_file():
409
438
  pypaths = [path]
410
439
  else: