kash-shell 0.3.34__py3-none-any.whl → 0.3.35__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,13 +1,19 @@
  from __future__ import annotations

+ import os
  import shutil
  import subprocess
+ from logging import getLogger
  from pathlib import Path

+ from dotenv import find_dotenv, load_dotenv
  from sidematter_format.sidematter_format import Sidematter
+ from strif import abbrev_str

  from kash.utils.common.url import Url, is_s3_url, parse_s3_url

+ log = getLogger(__name__)
+

  def check_aws_cli() -> None:
      """
@@ -19,6 +25,54 @@ def check_aws_cli() -> None:
          )


+ def run_aws_command(cmd: list[str]) -> subprocess.CompletedProcess[str]:
+     """
+     Run an AWS CLI command and capture output.
+     Raises a RuntimeError with stdout/stderr on failure.
+     """
+     result = subprocess.run(
+         cmd,
+         capture_output=True,
+         text=True,
+         env=os.environ,
+     )
+
+     if result.returncode != 0:
+         # Build a detailed error message
+         error_parts = [f"AWS command failed with exit code {result.returncode}"]
+         error_parts.append(f"Command: {' '.join(cmd)}")
+
+         if result.stdout:
+             error_parts.append(f"stdout: {result.stdout}")
+         if result.stderr:
+             error_parts.append(f"stderr: {result.stderr}")
+
+         raise RuntimeError("\n".join(error_parts))
+
+     return result
+
+
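For illustration, a call like the following (bucket name hypothetical) either returns the completed process or raises a RuntimeError carrying the exit code, command line, and any captured output:

    # Hypothetical usage; on failure the RuntimeError includes full context.
    result = run_aws_command(["aws", "s3", "ls", "s3://my-bucket/"])
    print(result.stdout)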
+ def reload_aws_env_vars() -> None:
+     """
+     Fresh reload of AWS env vars from .env.local.
+     """
+
+     def aws_creds() -> set[tuple[str, str]]:
+         return {(k, abbrev_str(v, 5)) for k, v in os.environ.items() if k.startswith("AWS_")}
+
+     if len(aws_creds()) == 0:
+         dotenv_path = find_dotenv(".env.local", usecwd=True) or find_dotenv(".env", usecwd=True)
+         load_dotenv(dotenv_path, override=True)
+         if len(aws_creds()) > 0:
+             log.info(
+                 "Loaded %s, found AWS credentials: %s",
+                 dotenv_path,
+                 aws_creds(),
+             )
+         else:
+             log.warning("No AWS credentials found in env or .env files")
+
+
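As a sketch of what this picks up: when no AWS_* variables are set, a `.env.local` (or `.env`) found by searching upward from the working directory is loaded with override. Illustrative file contents (values hypothetical):

    # .env.local (example values, hypothetical)
    AWS_ACCESS_KEY_ID=AKIA...
    AWS_SECRET_ACCESS_KEY=...
    AWS_DEFAULT_REGION=us-east-1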
  def get_s3_parent_folder(url: Url) -> Url | None:
      """
      Get the parent folder of an S3 URL, or None if not an S3 URL.
@@ -47,6 +101,7 @@ def s3_sync_to_folder(
      - For a single file: the file URL (and sidematter file/dir URLs if included).
      - For a directory: the destination parent prefix URL (non-recursive reporting).
      """
+     reload_aws_env_vars()

      src_path = Path(src_path)
      if not src_path.exists():
@@ -71,7 +126,7 @@
          for p in sync_paths:
              if p.is_file():
                  # Use sync with include/exclude to leverage default short-circuiting
-                 subprocess.run(
+                 run_aws_command(
                      [
                          "aws",
                          "s3",
@@ -82,27 +137,54 @@
                          "*",
                          "--include",
                          p.name,
-                     ],
-                     check=True,
+                     ]
                  )
                  targets.append(Url(dest_prefix + p.name))
              elif p.is_dir():
                  dest_dir = dest_prefix + p.name + "/"
-                 subprocess.run(["aws", "s3", "sync", str(p), dest_dir], check=True)
+                 run_aws_command(["aws", "s3", "sync", str(p), dest_dir])
                  targets.append(Url(dest_dir))

          return targets
      else:
          # Directory mode: sync whole directory.
-         subprocess.run(
+         run_aws_command(
              [
                  "aws",
                  "s3",
                  "sync",
                  str(src_path),
                  dest_prefix,
-             ],
-             check=True,
+             ]
          )
          targets.append(Url(dest_prefix))
          return targets
+
+
+ def s3_download_file(s3_url: Url, target_path: str | Path) -> None:
+     """
+     Download a file from S3 to a local path using the AWS CLI.
+
+     Args:
+         s3_url: The S3 URL to download from (s3://bucket/path/to/file)
+         target_path: The local path to save the file to
+     """
+     reload_aws_env_vars()
+
+     if not is_s3_url(s3_url):
+         raise ValueError(f"Source must be an s3:// URL: {s3_url}")
+
+     check_aws_cli()
+
+     target_path = Path(target_path)
+
+     # Use aws s3 cp to download the file
+     run_aws_command(
+         [
+             "aws",
+             "s3",
+             "cp",
+             str(s3_url),
+             str(target_path),
+         ]
+     )
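A minimal usage sketch (bucket and paths hypothetical); non-s3:// URLs raise ValueError, and CLI failures surface as the RuntimeError raised by run_aws_command:

    # Hypothetical usage of the new helper.
    s3_download_file(Url("s3://my-bucket/reports/summary.md"), "/tmp/summary.md")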
@@ -26,7 +26,6 @@ def fetch_page_content(

      Force re-fetching and updating the cache by setting `refetch` to true.

-
      For HTML and other text files, uses the `text_extractor` to extract
      clean text and page metadata.
      """
@@ -1,6 +1,8 @@
  from __future__ import annotations

  import logging
+ import ssl
+ from collections.abc import Iterable
  from dataclasses import dataclass
  from enum import Enum
  from functools import cache, cached_property
@@ -12,17 +14,145 @@ from cachetools import TTLCache
  from strif import atomic_output_file, copyfile_atomic

  from kash.config.env_settings import KashEnv
+ from kash.utils.common.s3_utils import s3_download_file
  from kash.utils.common.url import Url
  from kash.utils.file_utils.file_formats import MimeType

+ log = logging.getLogger(__name__)
+
+
+ def _httpx_verify_context() -> ssl.SSLContext | bool:
+     """
+     Return an SSLContext that uses the system trust store via truststore, if available.
+     Falls back to certifi bundle; otherwise True to use httpx defaults.
+     """
+     try:
+         import truststore
+
+         return truststore.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+     except Exception:
+         try:
+             import certifi
+
+             return ssl.create_default_context(cafile=certifi.where())
+         except Exception:
+             return True
+
+
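Whatever this returns is passed straight through as httpx's verify argument (httpx accepts either an SSLContext or a bool), as the helpers below do. An illustrative one-liner:

    # Sketch: truststore context if available, else certifi, else httpx defaults.
    client = httpx.Client(verify=_httpx_verify_context())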
+ def _stream_to_file(
+     target_filename: str | Path,
+     response_iterator: Iterable[bytes],
+     total_size: int,
+     show_progress: bool,
+ ) -> None:
+     with atomic_output_file(target_filename, make_parents=True) as temp_filename:
+         with open(temp_filename, "wb") as f:
+             if not show_progress:
+                 for chunk in response_iterator:
+                     if chunk:
+                         f.write(chunk)
+             else:
+                 from tqdm import tqdm
+
+                 with tqdm(
+                     total=total_size,
+                     unit="B",
+                     unit_scale=True,
+                     desc=f"Downloading {Path(str(target_filename)).name}",
+                 ) as progress:
+                     for chunk in response_iterator:
+                         if chunk:
+                             f.write(chunk)
+                             progress.update(len(chunk))
+
+
+ def _httpx_fetch(
+     url: Url,
+     *,
+     timeout: int,
+     auth: Any | None,
+     headers: dict[str, str] | None,
+     mode: ClientMode,
+     log_label: str,
+ ):
+     import httpx
+
+     req_headers = _get_req_headers(mode, headers)
+     parsed_url = urlparse(str(url))
+     with httpx.Client(
+         follow_redirects=True,
+         timeout=timeout,
+         auth=auth,
+         headers=req_headers,
+         verify=_httpx_verify_context(),
+     ) as client:
+         log.debug("fetch_url (%s): using headers: %s", log_label, client.headers)
+         if mode is ClientMode.BROWSER_HEADERS:
+             _prime_host(parsed_url.netloc, client, timeout)
+         response = client.get(url)
+         log.info(
+             "Fetched (%s): %s (%s bytes): %s",
+             log_label,
+             response.status_code,
+             len(response.content),
+             url,
+         )
+         response.raise_for_status()
+         return response
+
+
+ def _httpx_download(
+     url: Url,
+     target_filename: str | Path,
+     *,
+     show_progress: bool,
+     timeout: int,
+     auth: Any | None,
+     headers: dict[str, str] | None,
+     mode: ClientMode,
+     log_label: str,
+ ) -> dict[str, str]:
+     import httpx
+
+     req_headers = _get_req_headers(mode, headers)
+     parsed_url = urlparse(str(url))
+     with httpx.Client(
+         follow_redirects=True,
+         timeout=timeout,
+         headers=req_headers,
+         verify=_httpx_verify_context(),
+     ) as client:
+         if mode is ClientMode.BROWSER_HEADERS:
+             _prime_host(parsed_url.netloc, client, timeout)
+         log.debug("download_url (%s): using headers: %s", log_label, client.headers)
+         with client.stream("GET", url, auth=auth, follow_redirects=True) as response:
+             response.raise_for_status()
+             response_headers = dict(response.headers)
+             total = int(response.headers.get("content-length", "0"))
+             _stream_to_file(target_filename, response.iter_bytes(), total, show_progress)
+             return response_headers
+
+
+ def _is_tls_cert_error(exc: Exception) -> bool:
+     """
+     Heuristic detection of TLS/certificate verification errors coming from curl_cffi/libcurl.
+     """
+     s = str(exc).lower()
+     if "curl: (60)" in s:
+         return True
+     if "certificate verify failed" in s:
+         return True
+     if "ssl" in s and ("certificate" in s or "cert" in s or "handshake" in s):
+         return True
+     return False
+
+
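Illustrative inputs (error messages paraphrased from typical libcurl/OpenSSL wording, not captured output):

    _is_tls_cert_error(RuntimeError("curl: (60) SSL certificate problem"))  # True: libcurl code 60
    _is_tls_cert_error(RuntimeError("certificate verify failed"))           # True: OpenSSL failure text
    _is_tls_cert_error(RuntimeError("connection timed out"))                # False: unrelated errors pass through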
  if TYPE_CHECKING:
      from curl_cffi.requests import Response as CurlCffiResponse
      from curl_cffi.requests import Session as CurlCffiSession
      from httpx import Client as HttpxClient
      from httpx import Response as HttpxResponse

- log = logging.getLogger(__name__)
-

  DEFAULT_TIMEOUT = 30
  CURL_CFFI_IMPERSONATE_VERSION = "chrome120"
@@ -199,54 +329,57 @@ def fetch_url(

          from curl_cffi.requests import Session

-         with Session() as client:
-             # Set headers on the session - they will be sent with all requests
-             client.headers.update(req_headers)
-             _prime_host(
-                 parsed_url.netloc, client, timeout, impersonate=CURL_CFFI_IMPERSONATE_VERSION
+         exc: Exception | None = None
+         try:
+             with Session() as client:
+                 # Set headers on the session - they will be sent with all requests
+                 client.headers.update(req_headers)
+                 _prime_host(
+                     parsed_url.netloc, client, timeout, impersonate=CURL_CFFI_IMPERSONATE_VERSION
+                 )
+                 log.debug("fetch_url (curl_cffi): using session headers: %s", client.headers)
+                 response = client.get(
+                     url,
+                     impersonate=CURL_CFFI_IMPERSONATE_VERSION,
+                     timeout=timeout,
+                     auth=auth,
+                     allow_redirects=True,
+                 )
+                 log.info(
+                     "Fetched (curl_cffi): %s (%s bytes): %s",
+                     response.status_code,
+                     len(response.content),
+                     url,
+                 )
+                 response.raise_for_status()
+                 return response
+         except Exception as e:
+             exc = e
+
+         if exc and _is_tls_cert_error(exc):
+             log.warning(
+                 "TLS/SSL verification failed with curl_cffi for %s: %s; falling back to httpx",
+                 url,
+                 exc,
              )
-             log.debug("fetch_url (curl_cffi): using session headers: %s", client.headers)
-             response = client.get(
+             # Fallback to httpx with browser-like headers (uses system trust if available)
+             return _httpx_fetch(
                  url,
-                 impersonate=CURL_CFFI_IMPERSONATE_VERSION,
                  timeout=timeout,
                  auth=auth,
-                 allow_redirects=True,
-             )
-             log.info(
-                 "Fetched (curl_cffi): %s (%s bytes): %s",
-                 response.status_code,
-                 len(response.content),
-                 url,
+                 headers=headers,
+                 mode=ClientMode.BROWSER_HEADERS,
+                 log_label="httpx fallback",
              )
-             response.raise_for_status()
-             return response
+
+         if exc:
+             raise exc

      # Handle httpx modes
      else:
-         import httpx
-
-         with httpx.Client(
-             follow_redirects=True,
-             timeout=timeout,
-             auth=auth,
-             headers=req_headers,
-         ) as client:
-             log.debug("fetch_url (httpx): using headers: %s", client.headers)
-
-             # Cookie priming only makes sense for the browser-like mode
-             if mode is ClientMode.BROWSER_HEADERS:
-                 _prime_host(parsed_url.netloc, client, timeout)
-
-             response = client.get(url)
-             log.info(
-                 "Fetched (httpx): %s (%s bytes): %s",
-                 response.status_code,
-                 len(response.content),
-                 url,
-             )
-             response.raise_for_status()
-             return response
+         return _httpx_fetch(
+             url, timeout=timeout, auth=auth, headers=headers, mode=mode, log_label="httpx"
+         )


  @dataclass(frozen=True)
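The control flow above amounts to the following pattern (a simplified sketch with hypothetical helper names): curl_cffi is tried first, only TLS verification failures divert to httpx with the system trust store, and all other errors re-raise unchanged.

    # Simplified sketch of the fallback logic, not the actual function.
    try:
        return fetch_with_curl_cffi(url)
    except Exception as e:
        if _is_tls_cert_error(e):
            return fetch_with_httpx(url)  # verify via truststore/certifi context
        raise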
@@ -289,38 +422,13 @@ def download_url(
          copyfile_atomic(parsed_url.netloc + parsed_url.path, target_filename, make_parents=True)
          return None
      elif parsed_url.scheme == "s3":
-         import boto3  # pyright: ignore
-
-         s3 = boto3.resource("s3")
-         s3_path = parsed_url.path.lstrip("/")
          with atomic_output_file(target_filename, make_parents=True) as temp_filename:
-             s3.Bucket(parsed_url.netloc).download_file(s3_path, temp_filename)
+             s3_download_file(url, temp_filename)
          return None

      req_headers = _get_req_headers(mode, headers)
      response_headers = None

-     def stream_to_file(response_iterator, total_size):
-         with atomic_output_file(target_filename, make_parents=True) as temp_filename:
-             with open(temp_filename, "wb") as f:
-                 if not show_progress:
-                     for chunk in response_iterator:
-                         if chunk:  # Skip empty chunks
-                             f.write(chunk)
-                 else:
-                     from tqdm import tqdm
-
-                     with tqdm(
-                         total=total_size,
-                         unit="B",
-                         unit_scale=True,
-                         desc=f"Downloading {Path(target_filename).name}",
-                     ) as progress:
-                         for chunk in response_iterator:
-                             if chunk:  # Skip empty chunks
-                                 f.write(chunk)
-                                 progress.update(len(chunk))
-
      # Handle curl_cffi mode
      if mode is ClientMode.CURL_CFFI:
          if not _have_curl_cffi():
@@ -328,47 +436,111 @@ def download_url(

          from curl_cffi.requests import Session

-         with Session() as client:
-             # Set headers on the session; they will be sent with all requests
-             client.headers.update(req_headers)
-             _prime_host(
-                 parsed_url.netloc, client, timeout, impersonate=CURL_CFFI_IMPERSONATE_VERSION
-             )
-             log.debug("download_url (curl_cffi): using session headers: %s", client.headers)
+         exc: Exception | None = None
+         try:
+             with Session() as client:
+                 # Set headers on the session; they will be sent with all requests
+                 client.headers.update(req_headers)
+                 _prime_host(
+                     parsed_url.netloc, client, timeout, impersonate=CURL_CFFI_IMPERSONATE_VERSION
+                 )
+                 log.debug("download_url (curl_cffi): using session headers: %s", client.headers)
+                 response = client.get(
+                     url,
+                     impersonate=CURL_CFFI_IMPERSONATE_VERSION,
+                     timeout=timeout,
+                     auth=auth,
+                     allow_redirects=True,
+                     stream=True,
+                 )
+                 response.raise_for_status()
+                 response_headers = dict(response.headers)
+                 total = int(response.headers.get("content-length", "0"))
+
+                 # Use iter_content for streaming; this is the standard method for curl_cffi
+                 chunk_iterator = response.iter_content(chunk_size=8192)
+                 _stream_to_file(target_filename, chunk_iterator, total, show_progress)
+         except Exception as e:
+             exc = e

-             response = client.get(
+         if exc and _is_tls_cert_error(exc):
+             log.warning(
+                 "TLS/SSL verification failed with curl_cffi for %s: %s; falling back to httpx",
                  url,
-                 impersonate=CURL_CFFI_IMPERSONATE_VERSION,
+                 exc,
+             )
+             # Fallback to httpx streaming with browser-like headers (system trust store if available)
+             response_headers = _httpx_download(
+                 url,
+                 target_filename,
+                 show_progress=show_progress,
                  timeout=timeout,
                  auth=auth,
-                 allow_redirects=True,
-                 stream=True,
+                 headers=headers,
+                 mode=ClientMode.BROWSER_HEADERS,
+                 log_label="httpx fallback",
              )
-             response.raise_for_status()
-             response_headers = dict(response.headers)
-             total = int(response.headers.get("content-length", "0"))
-
-             # Use iter_content for streaming; this is the standard method for curl_cffi
-             chunk_iterator = response.iter_content(chunk_size=8192)
-             stream_to_file(chunk_iterator, total)
+         elif exc:
+             raise exc

      # Handle httpx modes
      else:
-         import httpx
-
-         with httpx.Client(follow_redirects=True, timeout=timeout, headers=req_headers) as client:
-             if mode is ClientMode.BROWSER_HEADERS:
-                 _prime_host(parsed_url.netloc, client, timeout)
-
-             log.debug("download_url (httpx): using headers: %s", client.headers)
-             with client.stream("GET", url, auth=auth, follow_redirects=True) as response:
-                 response.raise_for_status()
-                 response_headers = dict(response.headers)
-                 total = int(response.headers.get("content-length", "0"))
-                 stream_to_file(response.iter_bytes(), total)
+         response_headers = _httpx_download(
+             url,
+             target_filename,
+             show_progress=show_progress,
+             timeout=timeout,
+             auth=auth,
+             headers=headers,
+             mode=mode,
+             log_label="httpx",
+         )

      # Filter out None values from headers for HttpHeaders type compatibility
      if response_headers:
          clean_headers = {k: v for k, v in response_headers.items() if v is not None}
          return HttpHeaders(clean_headers)
      return None
+
+
+ def main() -> None:
+     """
+     Simple CLI test harness for fetch and download.
+
+     Usage examples:
+         uv run python -m kash.web_content.web_fetch
+         uv run python -m kash.web_content.web_fetch https://www.example.com
+     """
+     import sys
+     import traceback
+
+     # Try to use the system trust store for TLS like command-line curl
+     try:
+         import truststore  # type: ignore
+
+         truststore.inject_into_ssl()
+         log.warning("truststore initialized for test harness: using system TLS trust store")
+     except Exception as exc:
+         log.info("truststore not available for test harness; using default TLS trust (%s)", exc)
+
+     urls = [
+         "https://www.example.com",
+         "https://www.businessdefense.gov/ibr/mceip/dpai/dpat3/index.html",
+     ]
+
+     args = [a for a in sys.argv[1:] if a and a.strip()]
+     if args:
+         urls = args
+
+     for u in urls:
+         try:
+             log.warning("Testing fetch_url: %s", u)
+             r = fetch_url(Url(u))
+             log.warning("fetch_url OK: %s -> %s bytes", u, len(r.content))
+         except Exception as exc:
+             log.exception("fetch_url FAILED for %s: %s", u, exc)
+             traceback.print_exc()
+
+
+ if __name__ == "__main__":
+     main()
@@ -95,6 +95,8 @@ def get_ws(name_or_path: str | Path, auto_init: bool = True) -> FileStore:
      Get a workspace by name or path. Adds to the in-memory registry so we reuse it.
      With `auto_init` true, will initialize the workspace if it is not already initialized.
      """
+     if isinstance(name_or_path, Path):
+         name_or_path = name_or_path.expanduser().absolute()
      name = Path(name_or_path).name
      name = check_strict_workspace_name(name)
      info = resolve_ws(name_or_path)
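A sketch of the effect (paths hypothetical): Path inputs are now normalized before the workspace name is derived, so tilde and relative forms resolve consistently:

    # Hypothetical: both are normalized to absolute paths before lookup.
    get_ws(Path("~/workspaces/myws"))  # ~ expanded, then made absolute
    get_ws(Path("./myws"))             # resolved against the current directory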
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: kash-shell
- Version: 0.3.34
+ Version: 0.3.35
  Summary: The knowledge agent shell (core)
  Project-URL: Repository, https://github.com/jlevy/kash-shell
  Author-email: Joshua Levy <joshua@cal.berkeley.edu>
@@ -72,6 +72,7 @@ Requires-Dist: thefuzz>=0.22.1
  Requires-Dist: tiktoken>=0.9.0
  Requires-Dist: tldr>=3.3.0
  Requires-Dist: tminify>=0.1.6
+ Requires-Dist: truststore>=0.10.4
  Requires-Dist: typing-extensions>=4.12.2
  Requires-Dist: uvicorn>=0.34.0
  Requires-Dist: xonsh>=0.19.3