fsspec 2025.2.0__py3-none-any.whl → 2025.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,931 @@
1
+ """This file is largely copied from http.py"""
2
+
3
+ import io
4
+ import logging
5
+ import re
6
+ import urllib.error
7
+ import urllib.parse
8
+ from copy import copy
9
+ from json import dumps, loads
10
+ from urllib.parse import urlparse
11
+
12
+ try:
13
+ import yarl
14
+ except (ImportError, ModuleNotFoundError, OSError):
15
+ yarl = False
16
+
17
+ from fsspec.callbacks import _DEFAULT_CALLBACK
18
+ from fsspec.registry import register_implementation
19
+ from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
20
+ from fsspec.utils import DEFAULT_BLOCK_SIZE, isfilelike, nullcontext, tokenize
21
+
22
+ from ..caching import AllBytes
23
+
24
+ # https://stackoverflow.com/a/15926317/3821154
25
+ ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
26
+ ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
27
+ logger = logging.getLogger("fsspec.http")
28
+
29
class JsHttpException(urllib.error.HTTPError):
    """HTTP error raised when a request made through the JS runtime fails."""
31
+
32
+
33
class StreamIO(io.BytesIO):
    """In-memory stand-in for a streaming response body.

    A plain ``io.BytesIO`` subclass so arbitrary attributes can be set on
    instances; may eventually become a true streaming object.
    """
37
+
38
+
39
class ResponseProxy:
    """Looks like a requests response, wrapping a completed XMLHttpRequest.

    Parameters
    ----------
    req: JS XMLHttpRequest
        The finished (synchronous) request object.
    stream: bool
        If True, ``raw`` is wrapped in a ``StreamIO`` so it can be read in
        chunks; otherwise ``raw`` is the plain bytes payload.
    """

    def __init__(self, req, stream=False):
        self.request = req
        self.stream = stream
        self._data = None  # lazily-populated payload cache
        self._headers = None  # lazily-parsed header dict

    @property
    def raw(self):
        # Pull the response bytes out of the JS request object on first access.
        if self._data is None:
            b = self.request.response.to_bytes()
            if self.stream:
                self._data = StreamIO(b)
            else:
                self._data = b
        return self._data

    def close(self):
        # Drop the cached payload so its memory can be reclaimed.
        if hasattr(self, "_data"):
            del self._data

    @property
    def headers(self):
        if self._headers is None:
            # BUGFIX: split on the first ": " only, so header values that
            # themselves contain ": " (dates, URLs, etc.) stay intact; the
            # original unbounded split crashed dict() on such headers.
            self._headers = dict(
                line.split(": ", 1)
                for line in self.request.getAllResponseHeaders().strip().split("\r\n")
            )
        return self._headers

    @property
    def status_code(self):
        return int(self.request.status)

    def raise_for_status(self):
        """Raise ``JsHttpException`` for any 4xx/5xx status."""
        if not self.ok:
            raise JsHttpException(
                self.url, self.status_code, self.reason, self.headers, None
            )

    def iter_content(self, chunksize, *_, **__):
        """Yield the payload in chunks of ``chunksize`` bytes."""
        while True:
            out = self.raw.read(chunksize)
            if out:
                yield out
            else:
                break

    @property
    def reason(self):
        return self.request.statusText

    @property
    def ok(self):
        return self.status_code < 400

    @property
    def url(self):
        return self.request.response.responseURL

    @property
    def text(self):
        # TODO: encoding from headers
        return self.content.decode()

    @property
    def content(self):
        # Force the non-streaming path so ``raw`` is plain bytes.
        self.stream = False
        return self.raw

    def json(self):
        """Decode the payload as JSON."""
        return loads(self.text)
115
+
116
+
117
class RequestsSessionShim:
    """Minimal stand-in for ``requests.Session`` backed by the browser's
    synchronous XMLHttpRequest, for use inside pyodide/pyscript."""

    def __init__(self):
        # Default headers attribute, kept for requests.Session API parity.
        self.headers = {}

    def request(
        self,
        method,
        url,
        params=None,
        data=None,
        headers=None,
        cookies=None,
        files=None,
        auth=None,
        timeout=None,
        allow_redirects=None,
        proxies=None,
        hooks=None,
        stream=None,
        verify=None,
        cert=None,
        json=None,
    ):
        """Issue a synchronous XHR and wrap the result in a ``ResponseProxy``.

        Raises
        ------
        NotImplementedError
            If any unsupported requests feature (cert/verify/proxies/files/
            cookies/hooks) is passed.
        ValueError
            If both ``data`` and ``json`` are given.
        """
        from js import Blob, XMLHttpRequest

        logger.debug("JS request: %s %s", method, url)

        if cert or verify or proxies or files or cookies or hooks:
            raise NotImplementedError
        if data and json:
            raise ValueError("Use json= or data=, not both")
        req = XMLHttpRequest.new()
        extra = auth if auth else ()
        if params:
            url = f"{url}?{urllib.parse.urlencode(params)}"
        # Third argument False makes the XHR synchronous (blocking).
        req.open(method, url, False, *extra)
        if timeout:
            req.timeout = timeout
        if headers:
            for k, v in headers.items():
                req.setRequestHeader(k, v)

        req.setRequestHeader("Accept", "application/octet-stream")
        req.responseType = "arraybuffer"
        if json:
            # BUGFIX: serialize the ``json`` payload (the original serialized
            # ``data``, which is always None on this branch), and use the
            # string key "type" for the Blob options (the original used the
            # ``type`` builtin as the dict key).
            blob = Blob.new([dumps(json)], {"type": "application/json"})
            req.send(blob)
        elif data:
            if isinstance(data, io.IOBase):
                data = data.read()
            blob = Blob.new([data], {"type": "application/octet-stream"})
            req.send(blob)
        else:
            req.send(None)
        return ResponseProxy(req, stream=stream)

    def get(self, url, **kwargs):
        return self.request("GET", url, **kwargs)

    def head(self, url, **kwargs):
        return self.request("HEAD", url, **kwargs)

    def post(self, url, **kwargs):
        # BUGFIX: the original sent the malformed method string "POST}".
        return self.request("POST", url, **kwargs)

    def put(self, url, **kwargs):
        return self.request("PUT", url, **kwargs)

    def patch(self, url, **kwargs):
        return self.request("PATCH", url, **kwargs)

    def delete(self, url, **kwargs):
        return self.request("DELETE", url, **kwargs)
190
+
191
+
192
class HTTPFileSystem(AbstractFileSystem):
    """
    Simple File-System for fetching data via HTTP(S)

    This is the BLOCKING version of the normal HTTPFileSystem. It uses
    requests in normal python and the JS runtime in pyodide.

    ***This implementation is extremely experimental, do not use unless
    you are testing pyodide/pyscript integration***
    """

    protocol = ("http", "https", "sync-http", "sync-https")
    sep = "/"

    def __init__(
        self,
        simple_links=True,
        block_size=None,
        same_scheme=True,
        cache_type="readahead",
        cache_options=None,
        client_kwargs=None,
        encoded=False,
        **storage_options,
    ):
        """

        Parameters
        ----------
        block_size: int
            Blocks to read bytes; if 0, will default to raw requests file-like
            objects instead of HTTPFile instances
        simple_links: bool
            If True, will consider both HTML <a> tags and anything that looks
            like a URL; if False, will consider only the former.
        same_scheme: True
            When doing ls/glob, if this is True, only consider paths that have
            http/https matching the input URLs.
        size_policy: this argument is deprecated
        client_kwargs: dict
            Passed to aiohttp.ClientSession, see
            https://docs.aiohttp.org/en/stable/client_reference.html
            For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
        storage_options: key-value
            Any other parameters passed on to requests
        cache_type, cache_options: defaults used in open
        """
        # BUGFIX: the original passed ``self`` as an extra positional argument
        # to super().__init__(); AbstractFileSystem swallows *args so it was
        # silent, but it is dropped here for correctness.
        super().__init__(**storage_options)
        self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
        self.simple_links = simple_links
        self.same_schema = same_scheme
        self.cache_type = cache_type
        self.cache_options = cache_options
        self.client_kwargs = client_kwargs or {}
        self.encoded = encoded

        try:
            # If the js module is importable we are running under pyodide and
            # must use the XMLHttpRequest-based shim session.
            import js  # noqa: F401

            logger.debug("Starting JS session")
            self.session = RequestsSessionShim()
            self.js = True
        except Exception as e:
            import requests

            logger.debug("Starting cpython session because of: %s", e)
            # NOTE(review): requests.Session() takes no constructor arguments;
            # a non-empty client_kwargs would raise here — confirm intent.
            self.session = requests.Session(**(client_kwargs or {}))
            self.js = False

        # Everything left over (minus fsspec-internal listing options) is
        # passed through to each requests/XHR call.
        request_options = copy(storage_options)
        self.use_listings_cache = request_options.pop("use_listings_cache", False)
        request_options.pop("listings_expiry_time", None)
        request_options.pop("max_paths", None)
        request_options.pop("skip_instance_cache", None)
        self.kwargs = request_options

    @property
    def fsid(self):
        return "sync-http"

    def encode_url(self, url):
        """Return the URL as a ``yarl.URL`` when yarl is available."""
        if yarl:
            return yarl.URL(url, encoded=self.encoded)
        return url

    @classmethod
    def _strip_protocol(cls, path: str) -> str:
        """For HTTP, we always want to keep the full URL"""
        path = path.replace("sync-http://", "http://").replace(
            "sync-https://", "https://"
        )
        return path

    @classmethod
    def _parent(cls, path):
        # override, since _strip_protocol is different for URLs
        par = super()._parent(path)
        if len(par) > 7:  # "http://..."
            return par
        return ""

    def _ls_real(self, url, detail=True, **kwargs):
        """Fetch ``url`` and scrape links out of the returned HTML."""
        # ignoring URL-encoded arguments
        kw = self.kwargs.copy()
        kw.update(kwargs)
        logger.debug(url)
        # BUGFIX: use the merged ``kw`` (the original passed ``self.kwargs``,
        # silently dropping any per-call keyword arguments).
        r = self.session.get(self.encode_url(url), **kw)
        self._raise_not_found_for_status(r, url)
        text = r.text
        if self.simple_links:
            links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
        else:
            links = [u[2] for u in ex.findall(text)]
        out = set()
        parts = urlparse(url)
        for l in links:
            if isinstance(l, tuple):
                l = l[1]
            if l.startswith("/") and len(l) > 1:
                # absolute URL on this server
                l = parts.scheme + "://" + parts.netloc + l
            if l.startswith("http"):
                if self.same_schema and l.startswith(url.rstrip("/") + "/"):
                    out.add(l)
                elif l.replace("https", "http").startswith(
                    url.replace("https", "http").rstrip("/") + "/"
                ):
                    # allowed to cross http <-> https
                    out.add(l)
            else:
                if l not in ["..", "../"]:
                    # Ignore FTP-like "parent"
                    out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
        if not out and url.endswith("/"):
            out = self._ls_real(url.rstrip("/"), detail=False)
        if detail:
            return [
                {
                    "name": u,
                    "size": None,
                    "type": "directory" if u.endswith("/") else "file",
                }
                for u in out
            ]
        else:
            return sorted(out)

    def ls(self, url, detail=True, **kwargs):
        """List links under ``url``, consulting the dircache when enabled."""
        if self.use_listings_cache and url in self.dircache:
            out = self.dircache[url]
        else:
            out = self._ls_real(url, detail=detail, **kwargs)
            self.dircache[url] = out
        return out

    def _raise_not_found_for_status(self, response, url):
        """
        Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
        """
        if response.status_code == 404:
            raise FileNotFoundError(url)
        response.raise_for_status()

    def cat_file(self, url, start=None, end=None, **kwargs):
        """Fetch the (optionally ranged) contents of ``url`` as bytes."""
        kw = self.kwargs.copy()
        kw.update(kwargs)
        logger.debug(url)

        if start is not None or end is not None:
            if start == end:
                return b""
            headers = kw.pop("headers", {}).copy()

            headers["Range"] = self._process_limits(url, start, end)
            kw["headers"] = headers
        r = self.session.get(self.encode_url(url), **kw)
        self._raise_not_found_for_status(r, url)
        return r.content

    def get_file(
        self, rpath, lpath, chunk_size=5 * 2**20, callback=_DEFAULT_CALLBACK, **kwargs
    ):
        """Stream ``rpath`` into local file/path ``lpath`` in chunks."""
        kw = self.kwargs.copy()
        kw.update(kwargs)
        logger.debug(rpath)
        r = self.session.get(self.encode_url(rpath), **kw)
        try:
            size = int(
                r.headers.get("content-length", None)
                or r.headers.get("Content-Length", None)
            )
        except (ValueError, KeyError, TypeError):
            size = None

        callback.set_size(size)
        self._raise_not_found_for_status(r, rpath)
        # BUGFIX: close the output file if we opened it (the original leaked
        # the handle); files passed in by the caller are left open.
        if isfilelike(lpath):
            outfile = lpath
            close_when_done = False
        else:
            outfile = open(lpath, "wb")
            close_when_done = True
        try:
            for chunk in r.iter_content(chunk_size, decode_unicode=False):
                outfile.write(chunk)
                callback.relative_update(len(chunk))
        finally:
            if close_when_done:
                outfile.close()

    def put_file(
        self,
        lpath,
        rpath,
        chunk_size=5 * 2**20,
        callback=_DEFAULT_CALLBACK,
        method="post",
        **kwargs,
    ):
        """Upload local file/file-like ``lpath`` to ``rpath`` via POST/PUT."""

        def gen_chunks():
            # Support passing arbitrary file-like objects
            # and use them instead of streams.
            if isinstance(lpath, io.IOBase):
                context = nullcontext(lpath)
                use_seek = False  # might not support seeking
            else:
                context = open(lpath, "rb")
                use_seek = True

            with context as f:
                if use_seek:
                    callback.set_size(f.seek(0, 2))
                    f.seek(0)
                else:
                    callback.set_size(getattr(f, "size", None))

                chunk = f.read(chunk_size)
                while chunk:
                    yield chunk
                    callback.relative_update(len(chunk))
                    chunk = f.read(chunk_size)

        kw = self.kwargs.copy()
        kw.update(kwargs)

        method = method.lower()
        if method not in ("post", "put"):
            raise ValueError(
                f"method has to be either 'post' or 'put', not: {method!r}"
            )

        meth = getattr(self.session, method)
        resp = meth(rpath, data=gen_chunks(), **kw)
        self._raise_not_found_for_status(resp, rpath)

    def _process_limits(self, url, start, end):
        """Helper for "Range"-based _cat_file"""
        size = None
        suff = False
        if start is not None and start < 0:
            # if start is negative and end None, end is the "suffix length"
            if end is None:
                end = -start
                start = ""
                suff = True
            else:
                size = size or self.info(url)["size"]
                start = size + start
        elif start is None:
            start = 0
        if not suff:
            if end is not None and end < 0:
                if start is not None:
                    size = size or self.info(url)["size"]
                    end = size + end
            elif end is None:
                end = ""
            if isinstance(end, int):
                end -= 1  # bytes range is inclusive
        return f"bytes={start}-{end}"

    def exists(self, path, **kwargs):
        """True if a GET on ``path`` returns any non-error status."""
        kw = self.kwargs.copy()
        kw.update(kwargs)
        try:
            logger.debug(path)
            r = self.session.get(self.encode_url(path), **kw)
            return r.status_code < 400
        except Exception:
            return False

    def isfile(self, path, **kwargs):
        # every reachable URL is considered a file
        return self.exists(path, **kwargs)

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=None,  # XXX: This differs from the base class.
        cache_type=None,
        cache_options=None,
        size=None,
        **kwargs,
    ):
        """Make a file-like object

        Parameters
        ----------
        path: str
            Full URL with protocol
        mode: string
            must be "rb"
        block_size: int or None
            Bytes to download in one request; use instance value if None. If
            zero, will return a streaming Requests file-like instance.
        kwargs: key-value
            Any other parameters, passed to requests calls
        """
        if mode != "rb":
            raise NotImplementedError
        block_size = block_size if block_size is not None else self.block_size
        kw = self.kwargs.copy()
        kw.update(kwargs)
        size = size or self.info(path, **kwargs)["size"]
        if block_size and size:
            return HTTPFile(
                self,
                path,
                session=self.session,
                block_size=block_size,
                mode=mode,
                size=size,
                cache_type=cache_type or self.cache_type,
                cache_options=cache_options or self.cache_options,
                **kw,
            )
        else:
            return HTTPStreamFile(
                self,
                path,
                mode=mode,
                session=self.session,
                **kw,
            )

    def ukey(self, url):
        """Unique identifier; assume HTTP files are static, unchanging"""
        return tokenize(url, self.kwargs, self.protocol)

    def info(self, url, **kwargs):
        """Get info of URL

        Tries to access location via HEAD, and then GET methods, but does
        not fetch the data.

        It is possible that the server does not supply any size information, in
        which case size will be given as None (and certain operations on the
        corresponding file will not work).
        """
        info = {}
        for policy in ["head", "get"]:
            try:
                info.update(
                    _file_info(
                        self.encode_url(url),
                        size_policy=policy,
                        session=self.session,
                        **self.kwargs,
                        **kwargs,
                    )
                )
                if info.get("size") is not None:
                    break
            except Exception as exc:
                if policy == "get":
                    # If get failed, then raise a FileNotFoundError
                    raise FileNotFoundError(url) from exc
                logger.debug(str(exc))

        return {"name": url, "size": None, **info, "type": "file"}

    def glob(self, path, maxdepth=None, **kwargs):
        """
        Find files by glob-matching.

        This implementation is identical to the one in AbstractFileSystem,
        but "?" is not considered as a character for globbing, because it is
        so common in URLs, often identifying the "query" part.
        """
        ends = path.endswith("/")
        path = self._strip_protocol(path)
        indstar = path.find("*") if path.find("*") >= 0 else len(path)
        indbrace = path.find("[") if path.find("[") >= 0 else len(path)

        ind = min(indstar, indbrace)

        detail = kwargs.pop("detail", False)

        if not has_magic(path):
            root = path
            depth = 1
            if ends:
                path += "/*"
            elif self.exists(path):
                if not detail:
                    return [path]
                else:
                    return {path: self.info(path)}
            else:
                if not detail:
                    return []  # glob of non-existent returns empty
                else:
                    return {}
        elif "/" in path[:ind]:
            ind2 = path[:ind].rindex("/")
            root = path[: ind2 + 1]
            depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
        else:
            root = ""
            depth = None if "**" in path else path[ind + 1 :].count("/") + 1

        allpaths = self.find(
            root, maxdepth=maxdepth or depth, withdirs=True, detail=True, **kwargs
        )
        # Escape characters special to python regex, leaving our supported
        # special characters in place.
        # See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html
        # for shell globbing details.
        pattern = (
            "^"
            + (
                path.replace("\\", r"\\")
                .replace(".", r"\.")
                .replace("+", r"\+")
                .replace("//", "/")
                .replace("(", r"\(")
                .replace(")", r"\)")
                .replace("|", r"\|")
                .replace("^", r"\^")
                .replace("$", r"\$")
                .replace("{", r"\{")
                .replace("}", r"\}")
                .rstrip("/")
            )
            + "$"
        )
        pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
        pattern = re.sub("[*]", "[^/]*", pattern)
        pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
        out = {
            p: allpaths[p]
            for p in sorted(allpaths)
            if pattern.match(p.replace("//", "/").rstrip("/"))
        }
        if detail:
            return out
        else:
            return list(out)

    def isdir(self, path):
        # override, since all URLs are (also) files
        try:
            return bool(self.ls(path))
        except (FileNotFoundError, ValueError):
            return False
653
+
654
+
655
class HTTPFile(AbstractBufferedFile):
    """
    A file-like object pointing to a remote HTTP(S) resource

    Supports only reading, with read-ahead of a predetermined block-size.

    In the case that the server does not supply the filesize, only reading of
    the complete file in one go is supported.

    Parameters
    ----------
    url: str
        Full URL of the remote resource, including the protocol
    session: requests.Session or None
        All calls will be made within this session, to avoid restarting
        connections where the server allows this
    block_size: int or None
        The amount of read-ahead to do, in bytes. Default is 5MB, or the value
        configured for the FileSystem creating this file
    size: None or int
        If given, this is the size of the file in bytes, and we don't attempt
        to call the server to find the value.
    kwargs: all other key-values are passed to requests calls.
    """

    def __init__(
        self,
        fs,
        url,
        session=None,
        block_size=None,
        mode="rb",
        cache_type="bytes",
        cache_options=None,
        size=None,
        **kwargs,
    ):
        if mode != "rb":
            raise NotImplementedError("File mode not supported")
        self.url = url
        self.session = session
        # Pre-populate details so the base class does not call fs.info again.
        self.details = {"name": url, "size": size, "type": "file"}
        super().__init__(
            fs=fs,
            path=url,
            mode=mode,
            block_size=block_size,
            cache_type=cache_type,
            cache_options=cache_options,
            **kwargs,
        )

    def read(self, length=-1):
        """Read bytes from file

        Parameters
        ----------
        length: int
            Read up to this many bytes. If negative, read all content to end of
            file. If the server has not supplied the filesize, attempting to
            read only part of the data will raise a ValueError.
        """
        if (
            (length < 0 and self.loc == 0)  # explicit read all
            # but not when the size is known and fits into a block anyways
            and not (self.size is not None and self.size <= self.blocksize)
        ):
            self._fetch_all()
        if self.size is None:
            # Unknown size: only a full read is possible.
            if length < 0:
                self._fetch_all()
        else:
            # Clamp to the remaining bytes so the base class read is exact.
            length = min(self.size - self.loc, length)
        return super().read(length)

    def _fetch_all(self):
        """Read whole file in one shot, without caching

        This is only called when position is still at zero,
        and read() is called without a byte-count.
        """
        logger.debug(f"Fetch all for {self}")
        if not isinstance(self.cache, AllBytes):
            r = self.session.get(self.fs.encode_url(self.url), **self.kwargs)
            r.raise_for_status()
            out = r.content
            # Replace the cache so subsequent reads are served from memory.
            self.cache = AllBytes(size=len(out), fetcher=None, blocksize=None, data=out)
            self.size = len(out)

    def _parse_content_range(self, headers):
        """Parse the Content-Range header"""
        s = headers.get("Content-Range", "")
        m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
        if not m:
            return None, None, None

        if m[1] == "*":
            start = end = None
        else:
            start, end = [int(x) for x in m[1].split("-")]
        total = None if m[2] == "*" else int(m[2])
        return start, end, total

    def _fetch_range(self, start, end):
        """Download a block of data

        The expectation is that the server returns only the requested bytes,
        with HTTP code 206. If this is not the case, we first check the headers,
        and then stream the output - if the data size is bigger than we
        requested, an exception is raised.
        """
        logger.debug(f"Fetch range for {self}: {start}-{end}")
        kwargs = self.kwargs.copy()
        headers = kwargs.pop("headers", {}).copy()
        # HTTP Range is inclusive of both ends, hence end - 1.
        headers["Range"] = f"bytes={start}-{end - 1}"
        logger.debug("%s : %s", self.url, headers["Range"])
        r = self.session.get(self.fs.encode_url(self.url), headers=headers, **kwargs)
        if r.status_code == 416:
            # range request outside file
            return b""
        r.raise_for_status()

        # If the server has handled the range request, it should reply
        # with status 206 (partial content). But we'll guess that a suitable
        # Content-Range header or a Content-Length no more than the
        # requested range also mean we have got the desired range.
        cl = r.headers.get("Content-Length", r.headers.get("content-length", end + 1))
        response_is_range = (
            r.status_code == 206
            or self._parse_content_range(r.headers)[0] == start
            or int(cl) <= end - start
        )

        if response_is_range:
            # partial content, as expected
            out = r.content
        elif start > 0:
            raise ValueError(
                "The HTTP server doesn't appear to support range requests. "
                "Only reading this file from the beginning is supported. "
                "Open with block_size=0 for a streaming file interface."
            )
        else:
            # Response is not a range, but we want the start of the file,
            # so we can read the required amount anyway.
            cl = 0
            out = []
            for chunk in r.iter_content(2**20, False):
                out.append(chunk)
                cl += len(chunk)
            out = b"".join(out)[: end - start]
        return out
807
+
808
+
809
+ magic_check = re.compile("([*[])")
810
+
811
+
812
+ def has_magic(s):
813
+ match = magic_check.search(s)
814
+ return match is not None
815
+
816
+
817
class HTTPStreamFile(AbstractBufferedFile):
    """Streaming, read-only, non-seekable file for servers without range support.

    The whole response is consumed front-to-back via ``iter_content``;
    ``seek`` is not possible.
    """

    def __init__(self, fs, url, mode="rb", session=None, **kwargs):
        self.url = url
        self.session = session
        if mode != "rb":
            raise ValueError
        # Size is unknown for a streaming response.
        self.details = {"name": url, "size": None}
        super().__init__(fs=fs, path=url, mode=mode, cache_type="readahead", **kwargs)

        # Open the streaming GET immediately; data is pulled on demand.
        r = self.session.get(self.fs.encode_url(url), stream=True, **kwargs)
        self.fs._raise_not_found_for_status(r, url)
        self.it = r.iter_content(1024, False)
        # Bytes fetched from the iterator but not yet handed to the caller.
        self.leftover = b""

        self.r = r

    def seek(self, *args, **kwargs):
        raise ValueError("Cannot seek streaming HTTP file")

    def read(self, num=-1):
        # Accumulate chunks until we have ``num`` bytes (or EOF; num < 0
        # means read everything).
        bufs = [self.leftover]
        leng = len(self.leftover)
        while leng < num or num < 0:
            try:
                out = self.it.__next__()
            except StopIteration:
                break
            if out:
                bufs.append(out)
            else:
                break
            leng += len(out)
        out = b"".join(bufs)
        if num >= 0:
            # Keep any surplus for the next call.
            self.leftover = out[num:]
            out = out[:num]
        else:
            self.leftover = b""
        self.loc += len(out)
        return out

    def close(self):
        self.r.close()
        self.closed = True
861
+
862
+
863
def get_range(session, url, start, end, **kwargs):
    """Fetch bytes ``[start, end)`` of *url* with an explicit Range request.

    Use only when the server is known to honour range requests.
    """
    opts = dict(kwargs)
    headers = dict(opts.pop("headers", {}))
    # HTTP ranges are inclusive at both ends, hence end - 1.
    headers["Range"] = f"bytes={start}-{end - 1}"
    response = session.get(url, headers=headers, **opts)
    response.raise_for_status()
    return response.content
871
+
872
+
873
def _file_info(url, session, size_policy="head", **kwargs):
    """Call HEAD on the server to get details about the file (size/checksum etc.)

    Default operation is to explicitly allow redirects and use encoding
    'identity' (no compression) to get the true size of the target.
    """
    logger.debug("Retrieve file size for %s", url)
    opts = dict(kwargs)
    follow_redirects = opts.pop("allow_redirects", True)
    req_headers = dict(opts.get("headers", {}))
    # TODO: not allowed in JS
    # req_headers["Accept-Encoding"] = "identity"
    opts["headers"] = req_headers

    if size_policy == "head":
        r = session.head(url, allow_redirects=follow_redirects, **opts)
    elif size_policy == "get":
        r = session.get(url, allow_redirects=follow_redirects, **opts)
    else:
        raise TypeError(f'size_policy must be "head" or "get", got {size_policy}')
    r.raise_for_status()

    info = {}
    # TODO:
    # recognise lack of 'Accept-Ranges',
    # or 'Accept-Ranges': 'none' (not 'bytes')
    # to mean streaming only, no random access => return None
    # First matching header wins, in this order.
    for header, parse in (
        ("Content-Length", int),
        ("Content-Range", lambda v: int(v.split("/")[1])),
        ("content-length", int),
        ("content-range", lambda v: int(v.split("/")[1])),
    ):
        if header in r.headers:
            info["size"] = parse(r.headers[header])
            break

    for checksum_field in ("ETag", "Content-MD5", "Digest"):
        value = r.headers.get(checksum_field)
        if value:
            info[checksum_field] = value

    return info
914
+
915
+
916
+ # importing this is enough to register it
917
def register():
    """Register this blocking implementation for all four HTTP protocols."""
    for protocol in ("http", "https", "sync-http", "sync-https"):
        register_implementation(protocol, HTTPFileSystem, clobber=True)
922
+
923
+
924
+ register()
925
+
926
+
927
def unregister():
    """Restore the default (async) implementation for http/https."""
    from fsspec.implementations.http import HTTPFileSystem

    for protocol in ("http", "https"):
        register_implementation(protocol, HTTPFileSystem, clobber=True)