fsspec 2024.12.0__py3-none-any.whl → 2025.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,932 @@
1
+ """This file is largely copied from http.py"""
2
+
3
+ import io
4
+ import logging
5
+ import re
6
+ import urllib.error
7
+ import urllib.parse
8
+ from copy import copy
9
+ from json import dumps, loads
10
+ from urllib.parse import urlparse
11
+
12
+ try:
13
+ import yarl
14
+ except (ImportError, ModuleNotFoundError, OSError):
15
+ yarl = False
16
+
17
+ from fsspec.callbacks import _DEFAULT_CALLBACK
18
+ from fsspec.registry import register_implementation
19
+ from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
20
+ from fsspec.utils import DEFAULT_BLOCK_SIZE, isfilelike, nullcontext, tokenize
21
+
22
+ from ..caching import AllBytes
23
+
24
+ # https://stackoverflow.com/a/15926317/3821154
25
+ ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
26
+ ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
27
+ logger = logging.getLogger("fsspec.http")
28
+
29
+
30
class JsHttpException(urllib.error.HTTPError):
    """HTTPError subclass raised for failed requests made via the JS backend."""
31
+
32
+
33
class StreamIO(io.BytesIO):
    """A BytesIO that permits setting arbitrary attributes.

    Fake class, so you can set attributes on it; will eventually actually
    stream instead of buffering the whole body.
    """
37
+
38
+
39
class ResponseProxy:
    """Wrap a finished XMLHttpRequest so it can be used like a requests response.

    Body bytes and parsed headers are fetched lazily from the underlying
    request object and cached on first access.
    """

    def __init__(self, req, stream=False):
        self.request = req
        self.stream = stream
        self._data = None
        self._headers = None

    @property
    def raw(self):
        # Lazily extract the response body; wrap it in a file-like StreamIO
        # when streaming was requested, otherwise cache the raw bytes.
        if self._data is None:
            payload = self.request.response.to_bytes()
            self._data = StreamIO(payload) if self.stream else payload
        return self._data

    def close(self):
        # Drop the cached body; guard because the attribute may already
        # have been deleted by a previous close().
        if hasattr(self, "_data"):
            del self._data

    @property
    def headers(self):
        # Parse the raw header block ("Name: value" lines joined by CRLF)
        # into a plain dict, caching the result.
        if self._headers is None:
            raw_block = self.request.getAllResponseHeaders().strip()
            self._headers = dict(
                line.split(": ") for line in raw_block.split("\r\n")
            )
        return self._headers

    @property
    def status_code(self):
        return int(self.request.status)

    def raise_for_status(self):
        """Raise JsHttpException for any 4xx/5xx status."""
        if self.ok:
            return
        raise JsHttpException(
            self.url, self.status_code, self.reason, self.headers, None
        )

    def iter_content(self, chunksize, *_, **__):
        """Yield successive chunks of the body until exhausted."""
        while True:
            piece = self.raw.read(chunksize)
            if not piece:
                break
            yield piece

    @property
    def reason(self):
        return self.request.statusText

    @property
    def ok(self):
        return self.status_code < 400

    @property
    def url(self):
        return self.request.response.responseURL

    @property
    def text(self):
        # TODO: encoding from headers
        return self.content.decode()

    @property
    def content(self):
        # Force non-streaming so ``raw`` returns the plain bytes.
        self.stream = False
        return self.raw

    @property
    def json(self):
        # NOTE: unlike requests, this is a property rather than a method.
        return loads(self.text)
116
+
117
+
118
class RequestsSessionShim:
    """Minimal synchronous stand-in for ``requests.Session``.

    Issues blocking requests via the browser's XMLHttpRequest when running
    under pyodide. Only the subset of the requests API used by this module
    is implemented.
    """

    def __init__(self):
        # mirrors requests.Session.headers; currently not merged into requests
        self.headers = {}

    def request(
        self,
        method,
        url,
        params=None,
        data=None,
        headers=None,
        cookies=None,
        files=None,
        auth=None,
        timeout=None,
        allow_redirects=None,
        proxies=None,
        hooks=None,
        stream=None,
        verify=None,
        cert=None,
        json=None,
    ):
        """Perform a blocking HTTP request through XMLHttpRequest.

        Parameters mirror ``requests.Session.request``; options that cannot
        be expressed with XHR (cert, verify, proxies, files, cookies, hooks)
        raise NotImplementedError.

        Returns
        -------
        ResponseProxy
            A requests-like wrapper around the completed XHR.
        """
        from js import Blob, XMLHttpRequest

        logger.debug("JS request: %s %s", method, url)

        if cert or verify or proxies or files or cookies or hooks:
            raise NotImplementedError
        if data and json:
            raise ValueError("Use json= or data=, not both")
        req = XMLHttpRequest.new()
        # auth, if given, is passed as (user, password) to XHR's open()
        extra = auth if auth else ()
        if params:
            url = f"{url}?{urllib.parse.urlencode(params)}"
        # third argument False => synchronous (blocking) request
        req.open(method, url, False, *extra)
        if timeout:
            req.timeout = timeout
        if headers:
            for k, v in headers.items():
                req.setRequestHeader(k, v)

        req.setRequestHeader("Accept", "application/octet-stream")
        req.responseType = "arraybuffer"
        if json:
            # NOTE(review): the dict key here is the builtin ``type`` (not the
            # string "type"), preserved from the original — confirm this is
            # what the JS Blob constructor expects via pyodide conversion.
            blob = Blob.new([dumps(data)], {type: "application/json"})
            req.send(blob)
        elif data:
            if isinstance(data, io.IOBase):
                data = data.read()
            blob = Blob.new([data], {type: "application/octet-stream"})
            req.send(blob)
        else:
            req.send(None)
        return ResponseProxy(req, stream=stream)

    def get(self, url, **kwargs):
        return self.request("GET", url, **kwargs)

    def head(self, url, **kwargs):
        return self.request("HEAD", url, **kwargs)

    def post(self, url, **kwargs):
        # bug fix: the original sent the literal method string "POST}",
        # which is not a valid HTTP method
        return self.request("POST", url, **kwargs)

    def put(self, url, **kwargs):
        return self.request("PUT", url, **kwargs)

    def patch(self, url, **kwargs):
        return self.request("PATCH", url, **kwargs)

    def delete(self, url, **kwargs):
        return self.request("DELETE", url, **kwargs)
191
+
192
+
193
class HTTPFileSystem(AbstractFileSystem):
    """
    Simple File-System for fetching data via HTTP(S)

    This is the BLOCKING version of the normal HTTPFileSystem. It uses
    requests in normal python and the JS runtime in pyodide.

    ***This implementation is extremely experimental, do not use unless
    you are testing pyodide/pyscript integration***
    """

    protocol = ("http", "https", "sync_http", "sync_https")
    sep = "/"

    def __init__(
        self,
        simple_links=True,
        block_size=None,
        same_scheme=True,
        cache_type="readahead",
        cache_options=None,
        client_kwargs=None,
        encoded=False,
        **storage_options,
    ):
        """

        Parameters
        ----------
        block_size: int
            Blocks to read bytes; if 0, will default to raw requests file-like
            objects instead of HTTPFile instances
        simple_links: bool
            If True, will consider both HTML <a> tags and anything that looks
            like a URL; if False, will consider only the former.
        same_scheme: True
            When doing ls/glob, if this is True, only consider paths that have
            http/https matching the input URLs.
        size_policy: this argument is deprecated
        client_kwargs: dict
            Passed to aiohttp.ClientSession, see
            https://docs.aiohttp.org/en/stable/client_reference.html
            For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
        storage_options: key-value
            Any other parameters passed on to requests
        cache_type, cache_options: defaults used in open
        """
        # fix: the original passed a stray ``self`` positional argument
        # (``super().__init__(self, ...)``); AbstractFileSystem absorbs and
        # ignores extra positionals, but it was never intended.
        super().__init__(**storage_options)
        self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
        self.simple_links = simple_links
        self.same_schema = same_scheme
        self.cache_type = cache_type
        self.cache_options = cache_options
        self.client_kwargs = client_kwargs or {}
        self.encoded = encoded

        try:
            import js  # noqa: F401

            logger.debug("Starting JS session")
            self.session = RequestsSessionShim()
            self.js = True
        except Exception as e:
            import requests

            logger.debug("Starting cpython session because of: %s", e)
            # NOTE(review): requests.Session() accepts no constructor kwargs;
            # a non-empty client_kwargs would raise TypeError here — confirm
            # intended behavior upstream.
            self.session = requests.Session(**(client_kwargs or {}))
            self.js = False

        # Per-request keyword arguments are the storage_options left over
        # after removing the listings-cache/instance-cache options that
        # fsspec itself consumes.
        request_options = copy(storage_options)
        self.use_listings_cache = request_options.pop("use_listings_cache", False)
        request_options.pop("listings_expiry_time", None)
        request_options.pop("max_paths", None)
        request_options.pop("skip_instance_cache", None)
        self.kwargs = request_options

    @property
    def fsid(self):
        return "http_sync"

    def encode_url(self, url):
        """Return a yarl.URL (respecting ``encoded``) if yarl is available."""
        if yarl:
            return yarl.URL(url, encoded=self.encoded)
        return url

    @classmethod
    def _strip_protocol(cls, path: str) -> str:
        """For HTTP, we always want to keep the full URL"""
        path = path.replace("http_sync://", "http://").replace(
            "https_sync://", "https://"
        )
        return path

    @classmethod
    def _parent(cls, path):
        # override, since _strip_protocol is different for URLs
        par = super()._parent(path)
        if len(par) > 7:  # "http://..."
            return par
        return ""

    def _ls_real(self, url, detail=True, **kwargs):
        """Fetch *url* and scrape links from its body to emulate a listing."""
        # ignoring URL-encoded arguments
        kw = self.kwargs.copy()
        kw.update(kwargs)
        logger.debug(url)
        # bug fix: the original built ``kw`` but then passed ``**self.kwargs``,
        # silently dropping any per-call keyword arguments
        r = self.session.get(self.encode_url(url), **kw)
        self._raise_not_found_for_status(r, url)
        text = r.text
        if self.simple_links:
            # both <a href=...> targets and anything URL-shaped in the body
            links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
        else:
            links = [u[2] for u in ex.findall(text)]
        out = set()
        parts = urlparse(url)
        for l in links:
            if isinstance(l, tuple):
                l = l[1]
            if l.startswith("/") and len(l) > 1:
                # absolute URL on this server
                l = parts.scheme + "://" + parts.netloc + l
            if l.startswith("http"):
                if self.same_schema and l.startswith(url.rstrip("/") + "/"):
                    out.add(l)
                elif l.replace("https", "http").startswith(
                    url.replace("https", "http").rstrip("/") + "/"
                ):
                    # allowed to cross http <-> https
                    out.add(l)
            else:
                if l not in ["..", "../"]:
                    # Ignore FTP-like "parent"
                    out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
        if not out and url.endswith("/"):
            out = self._ls_real(url.rstrip("/"), detail=False)
        if detail:
            return [
                {
                    "name": u,
                    "size": None,
                    "type": "directory" if u.endswith("/") else "file",
                }
                for u in out
            ]
        else:
            return sorted(out)

    def ls(self, url, detail=True, **kwargs):
        """List a URL, using the dircache when listings caching is enabled."""
        if self.use_listings_cache and url in self.dircache:
            out = self.dircache[url]
        else:
            out = self._ls_real(url, detail=detail, **kwargs)
            # NOTE(review): the cached value's shape depends on the ``detail``
            # flag of the call that populated it — confirm callers tolerate
            # this, as in the original.
            self.dircache[url] = out
        return out

    def _raise_not_found_for_status(self, response, url):
        """
        Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
        """
        if response.status_code == 404:
            raise FileNotFoundError(url)
        response.raise_for_status()

    def cat_file(self, url, start=None, end=None, **kwargs):
        """Fetch (part of) a URL's contents as bytes.

        If start/end are given, a Range header is added; an empty range
        returns b"" without a request.
        """
        kw = self.kwargs.copy()
        kw.update(kwargs)
        logger.debug(url)

        if start is not None or end is not None:
            if start == end:
                return b""
            headers = kw.pop("headers", {}).copy()

            headers["Range"] = self._process_limits(url, start, end)
            kw["headers"] = headers
        r = self.session.get(self.encode_url(url), **kw)
        self._raise_not_found_for_status(r, url)
        return r.content

    def get_file(
        self, rpath, lpath, chunk_size=5 * 2**20, callback=_DEFAULT_CALLBACK, **kwargs
    ):
        """Stream *rpath* to the local path or file-like object *lpath*."""
        kw = self.kwargs.copy()
        kw.update(kwargs)
        logger.debug(rpath)
        r = self.session.get(self.encode_url(rpath), **kw)
        try:
            size = int(
                r.headers.get("content-length", None)
                or r.headers.get("Content-Length", None)
            )
        except (ValueError, KeyError, TypeError):
            size = None

        callback.set_size(size)
        self._raise_not_found_for_status(r, rpath)
        # bug fix: the original opened the local file and never closed it;
        # use a context manager, but don't close a caller-supplied file object
        ctx = nullcontext(lpath) if isfilelike(lpath) else open(lpath, "wb")
        with ctx as f:
            for chunk in r.iter_content(chunk_size, decode_unicode=False):
                f.write(chunk)
                callback.relative_update(len(chunk))

    def put_file(
        self,
        lpath,
        rpath,
        chunk_size=5 * 2**20,
        callback=_DEFAULT_CALLBACK,
        method="post",
        **kwargs,
    ):
        """Upload a local path or file-like object via POST or PUT."""

        def gen_chunks():
            # Support passing arbitrary file-like objects
            # and use them instead of streams.
            if isinstance(lpath, io.IOBase):
                context = nullcontext(lpath)
                use_seek = False  # might not support seeking
            else:
                context = open(lpath, "rb")
                use_seek = True

            with context as f:
                if use_seek:
                    callback.set_size(f.seek(0, 2))
                    f.seek(0)
                else:
                    callback.set_size(getattr(f, "size", None))

                chunk = f.read(chunk_size)
                while chunk:
                    yield chunk
                    callback.relative_update(len(chunk))
                    chunk = f.read(chunk_size)

        kw = self.kwargs.copy()
        kw.update(kwargs)

        method = method.lower()
        if method not in ("post", "put"):
            raise ValueError(
                f"method has to be either 'post' or 'put', not: {method!r}"
            )

        meth = getattr(self.session, method)
        resp = meth(rpath, data=gen_chunks(), **kw)
        self._raise_not_found_for_status(resp, rpath)

    def _process_limits(self, url, start, end):
        """Helper for "Range"-based _cat_file"""
        size = None
        suff = False
        if start is not None and start < 0:
            # if start is negative and end None, end is the "suffix length"
            if end is None:
                end = -start
                start = ""
                suff = True
            else:
                size = size or self.info(url)["size"]
                start = size + start
        elif start is None:
            start = 0
        if not suff:
            if end is not None and end < 0:
                if start is not None:
                    size = size or self.info(url)["size"]
                    end = size + end
            elif end is None:
                end = ""
            if isinstance(end, int):
                end -= 1  # bytes range is inclusive
        return f"bytes={start}-{end}"

    def exists(self, path, **kwargs):
        """True if a GET of *path* succeeds with a status below 400."""
        kw = self.kwargs.copy()
        kw.update(kwargs)
        try:
            logger.debug(path)
            r = self.session.get(self.encode_url(path), **kw)
            return r.status_code < 400
        except Exception:
            # any transport error is treated as "does not exist"
            return False

    def isfile(self, path, **kwargs):
        # every reachable URL is considered a file
        return self.exists(path, **kwargs)

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=None,  # XXX: This differs from the base class.
        cache_type=None,
        cache_options=None,
        size=None,
        **kwargs,
    ):
        """Make a file-like object

        Parameters
        ----------
        path: str
            Full URL with protocol
        mode: string
            must be "rb"
        block_size: int or None
            Bytes to download in one request; use instance value if None. If
            zero, will return a streaming Requests file-like instance.
        kwargs: key-value
            Any other parameters, passed to requests calls
        """
        if mode != "rb":
            raise NotImplementedError
        block_size = block_size if block_size is not None else self.block_size
        kw = self.kwargs.copy()
        kw.update(kwargs)
        size = size or self.info(path, **kwargs)["size"]
        if block_size and size:
            return HTTPFile(
                self,
                path,
                session=self.session,
                block_size=block_size,
                mode=mode,
                size=size,
                cache_type=cache_type or self.cache_type,
                cache_options=cache_options or self.cache_options,
                **kw,
            )
        else:
            # unknown size or block_size==0: fall back to pure streaming
            return HTTPStreamFile(
                self,
                path,
                mode=mode,
                session=self.session,
                **kw,
            )

    def ukey(self, url):
        """Unique identifier; assume HTTP files are static, unchanging"""
        return tokenize(url, self.kwargs, self.protocol)

    def info(self, url, **kwargs):
        """Get info of URL

        Tries to access location via HEAD, and then GET methods, but does
        not fetch the data.

        It is possible that the server does not supply any size information, in
        which case size will be given as None (and certain operations on the
        corresponding file will not work).
        """
        info = {}
        for policy in ["head", "get"]:
            try:
                info.update(
                    _file_info(
                        self.encode_url(url),
                        size_policy=policy,
                        session=self.session,
                        **self.kwargs,
                        **kwargs,
                    )
                )
                if info.get("size") is not None:
                    break
            except Exception as exc:
                if policy == "get":
                    # If get failed, then raise a FileNotFoundError
                    raise FileNotFoundError(url) from exc
                logger.debug(str(exc))

        return {"name": url, "size": None, **info, "type": "file"}

    def glob(self, path, maxdepth=None, **kwargs):
        """
        Find files by glob-matching.

        This implementation is identical to the one in AbstractFileSystem,
        but "?" is not considered as a character for globbing, because it is
        so common in URLs, often identifying the "query" part.
        """
        # (the redundant function-local ``import re`` was removed; the module
        # already imports re at top level)
        ends = path.endswith("/")
        path = self._strip_protocol(path)
        indstar = path.find("*") if path.find("*") >= 0 else len(path)
        indbrace = path.find("[") if path.find("[") >= 0 else len(path)

        ind = min(indstar, indbrace)

        detail = kwargs.pop("detail", False)

        if not has_magic(path):
            root = path
            depth = 1
            if ends:
                path += "/*"
            elif self.exists(path):
                if not detail:
                    return [path]
                else:
                    return {path: self.info(path)}
            else:
                if not detail:
                    return []  # glob of non-existent returns empty
                else:
                    return {}
        elif "/" in path[:ind]:
            ind2 = path[:ind].rindex("/")
            root = path[: ind2 + 1]
            depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
        else:
            root = ""
            depth = None if "**" in path else path[ind + 1 :].count("/") + 1

        allpaths = self.find(
            root, maxdepth=maxdepth or depth, withdirs=True, detail=True, **kwargs
        )
        # Escape characters special to python regex, leaving our supported
        # special characters in place.
        # See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html
        # for shell globbing details.
        pattern = (
            "^"
            + (
                path.replace("\\", r"\\")
                .replace(".", r"\.")
                .replace("+", r"\+")
                .replace("//", "/")
                .replace("(", r"\(")
                .replace(")", r"\)")
                .replace("|", r"\|")
                .replace("^", r"\^")
                .replace("$", r"\$")
                .replace("{", r"\{")
                .replace("}", r"\}")
                .rstrip("/")
            )
            + "$"
        )
        pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
        pattern = re.sub("[*]", "[^/]*", pattern)
        pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
        out = {
            p: allpaths[p]
            for p in sorted(allpaths)
            if pattern.match(p.replace("//", "/").rstrip("/"))
        }
        if detail:
            return out
        else:
            return list(out)

    def isdir(self, path):
        # override, since all URLs are (also) files
        try:
            return bool(self.ls(path))
        except (FileNotFoundError, ValueError):
            return False
654
+
655
+
656
class HTTPFile(AbstractBufferedFile):
    """
    A file-like object pointing to a remote HTTP(S) resource

    Supports only reading, with read-ahead of a predetermined block-size.

    In the case that the server does not supply the filesize, only reading of
    the complete file in one go is supported.

    Parameters
    ----------
    url: str
        Full URL of the remote resource, including the protocol
    session: requests.Session or None
        All calls will be made within this session, to avoid restarting
        connections where the server allows this
    block_size: int or None
        The amount of read-ahead to do, in bytes. Default is 5MB, or the value
        configured for the FileSystem creating this file
    size: None or int
        If given, this is the size of the file in bytes, and we don't attempt
        to call the server to find the value.
    kwargs: all other key-values are passed to requests calls.
    """

    def __init__(
        self,
        fs,
        url,
        session=None,
        block_size=None,
        mode="rb",
        cache_type="bytes",
        cache_options=None,
        size=None,
        **kwargs,
    ):
        if mode != "rb":
            raise NotImplementedError("File mode not supported")
        self.url = url
        self.session = session
        # details is set before super().__init__ so the (possibly None) size
        # passed by the caller is used rather than re-queried
        self.details = {"name": url, "size": size, "type": "file"}
        super().__init__(
            fs=fs,
            path=url,
            mode=mode,
            block_size=block_size,
            cache_type=cache_type,
            cache_options=cache_options,
            **kwargs,
        )

    def read(self, length=-1):
        """Read bytes from file

        Parameters
        ----------
        length: int
            Read up to this many bytes. If negative, read all content to end of
            file. If the server has not supplied the filesize, attempting to
            read only part of the data will raise a ValueError.
        """
        if (
            (length < 0 and self.loc == 0)  # explicit read all
            # but not when the size is known and fits into a block anyways
            and not (self.size is not None and self.size <= self.blocksize)
        ):
            self._fetch_all()
        if self.size is None:
            if length < 0:
                self._fetch_all()
        else:
            # clamp so we never ask the cache for bytes past EOF
            length = min(self.size - self.loc, length)
        return super().read(length)

    def _fetch_all(self):
        """Read whole file in one shot, without caching

        This is only called when position is still at zero,
        and read() is called without a byte-count.
        """
        logger.debug(f"Fetch all for {self}")
        if not isinstance(self.cache, AllBytes):
            r = self.session.get(self.fs.encode_url(self.url), **self.kwargs)
            r.raise_for_status()
            out = r.content
            # replace the block cache with one holding the entire body
            self.cache = AllBytes(size=len(out), fetcher=None, blocksize=None, data=out)
            self.size = len(out)

    def _parse_content_range(self, headers):
        """Parse the Content-Range header

        Returns a (start, end, total) tuple of ints; each element is None
        when the corresponding part of the header is "*" or the header is
        missing/malformed.
        """
        s = headers.get("Content-Range", "")
        m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
        if not m:
            return None, None, None

        if m[1] == "*":
            start = end = None
        else:
            start, end = [int(x) for x in m[1].split("-")]
        total = None if m[2] == "*" else int(m[2])
        return start, end, total

    def _fetch_range(self, start, end):
        """Download a block of data

        The expectation is that the server returns only the requested bytes,
        with HTTP code 206. If this is not the case, we first check the headers,
        and then stream the output - if the data size is bigger than we
        requested, an exception is raised.
        """
        logger.debug(f"Fetch range for {self}: {start}-{end}")
        kwargs = self.kwargs.copy()
        headers = kwargs.pop("headers", {}).copy()
        # HTTP ranges are inclusive at both ends, hence end - 1
        headers["Range"] = f"bytes={start}-{end - 1}"
        logger.debug("%s : %s", self.url, headers["Range"])
        r = self.session.get(self.fs.encode_url(self.url), headers=headers, **kwargs)
        if r.status_code == 416:
            # range request outside file
            return b""
        r.raise_for_status()

        # If the server has handled the range request, it should reply
        # with status 206 (partial content). But we'll guess that a suitable
        # Content-Range header or a Content-Length no more than the
        # requested range also mean we have got the desired range.
        cl = r.headers.get("Content-Length", r.headers.get("content-length", end + 1))
        response_is_range = (
            r.status_code == 206
            or self._parse_content_range(r.headers)[0] == start
            or int(cl) <= end - start
        )

        if response_is_range:
            # partial content, as expected
            out = r.content
        elif start > 0:
            raise ValueError(
                "The HTTP server doesn't appear to support range requests. "
                "Only reading this file from the beginning is supported. "
                "Open with block_size=0 for a streaming file interface."
            )
        else:
            # Response is not a range, but we want the start of the file,
            # so we can read the required amount anyway.
            cl = 0
            out = []
            for chunk in r.iter_content(2**20, False):
                out.append(chunk)
                cl += len(chunk)
            out = b"".join(out)[: end - start]
        return out
808
+
809
+
810
# Characters that trigger glob matching; "?" is deliberately excluded since
# it is common in URLs (query separator).
magic_check = re.compile("([*[])")


def has_magic(s):
    """Return True when *s* contains a glob special character (* or [)."""
    return magic_check.search(s) is not None
816
+
817
+
818
class HTTPStreamFile(AbstractBufferedFile):
    """Forward-only, non-seekable file over a streaming HTTP GET response."""

    def __init__(self, fs, url, mode="rb", session=None, **kwargs):
        self.url = url
        self.session = session
        if mode != "rb":
            raise ValueError
        # size unknown: content is consumed as it streams in
        self.details = {"name": url, "size": None}
        super().__init__(fs=fs, path=url, mode=mode, cache_type="readahead", **kwargs)

        r = self.session.get(self.fs.encode_url(url), stream=True, **kwargs)
        self.fs._raise_not_found_for_status(r, url)
        # iterator over 1kB chunks of the response body
        self.it = r.iter_content(1024, False)
        # bytes fetched from the iterator but not yet returned by read()
        self.leftover = b""

        self.r = r

    def seek(self, *args, **kwargs):
        raise ValueError("Cannot seek streaming HTTP file")

    def read(self, num=-1):
        """Read up to *num* bytes (all remaining bytes if negative).

        Accumulates chunks from the response iterator until enough bytes are
        available; any surplus is kept in ``self.leftover`` for the next call.
        """
        bufs = [self.leftover]
        leng = len(self.leftover)
        while leng < num or num < 0:
            try:
                out = self.it.__next__()
            except StopIteration:
                break
            if out:
                bufs.append(out)
            else:
                break
            leng += len(out)
        out = b"".join(bufs)
        if num >= 0:
            # keep any excess for the next read
            self.leftover = out[num:]
            out = out[:num]
        else:
            self.leftover = b""
        self.loc += len(out)
        return out

    def close(self):
        # close the underlying HTTP response and mark the file closed
        self.r.close()
        self.closed = True
862
+
863
+
864
def get_range(session, url, start, end, **kwargs):
    """Fetch bytes [start, end) of *url* via an explicit Range request.

    Intended for cases where a range request is known to be safe.
    """
    opts = kwargs.copy()
    range_headers = opts.pop("headers", {}).copy()
    # HTTP byte ranges are inclusive, hence end - 1
    range_headers["Range"] = f"bytes={start}-{end - 1}"
    resp = session.get(url, headers=range_headers, **opts)
    resp.raise_for_status()
    return resp.content
872
+
873
+
874
+ def _file_info(url, session, size_policy="head", **kwargs):
875
+ """Call HEAD on the server to get details about the file (size/checksum etc.)
876
+
877
+ Default operation is to explicitly allow redirects and use encoding
878
+ 'identity' (no compression) to get the true size of the target.
879
+ """
880
+ logger.debug("Retrieve file size for %s", url)
881
+ kwargs = kwargs.copy()
882
+ ar = kwargs.pop("allow_redirects", True)
883
+ head = kwargs.get("headers", {}).copy()
884
+ # TODO: not allowed in JS
885
+ # head["Accept-Encoding"] = "identity"
886
+ kwargs["headers"] = head
887
+
888
+ info = {}
889
+ if size_policy == "head":
890
+ r = session.head(url, allow_redirects=ar, **kwargs)
891
+ elif size_policy == "get":
892
+ r = session.get(url, allow_redirects=ar, **kwargs)
893
+ else:
894
+ raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
895
+ r.raise_for_status()
896
+
897
+ # TODO:
898
+ # recognise lack of 'Accept-Ranges',
899
+ # or 'Accept-Ranges': 'none' (not 'bytes')
900
+ # to mean streaming only, no random access => return None
901
+ if "Content-Length" in r.headers:
902
+ info["size"] = int(r.headers["Content-Length"])
903
+ elif "Content-Range" in r.headers:
904
+ info["size"] = int(r.headers["Content-Range"].split("/")[1])
905
+ elif "content-length" in r.headers:
906
+ info["size"] = int(r.headers["content-length"])
907
+ elif "content-range" in r.headers:
908
+ info["size"] = int(r.headers["content-range"].split("/")[1])
909
+
910
+ for checksum_field in ["ETag", "Content-MD5", "Digest"]:
911
+ if r.headers.get(checksum_field):
912
+ info[checksum_field] = r.headers[checksum_field]
913
+
914
+ return info
915
+
916
+
917
# importing this is enough to register it
def register():
    """Install this blocking HTTPFileSystem for all supported protocols."""
    for protocol in ("http", "https", "sync_http", "sync_https"):
        register_implementation(protocol, HTTPFileSystem, clobber=True)


register()
926
+
927
+
928
def unregister():
    """Restore the default (async) HTTPFileSystem for http and https."""
    from fsspec.implementations.http import HTTPFileSystem

    for protocol in ("http", "https"):
        register_implementation(protocol, HTTPFileSystem, clobber=True)