fsspec 2023.9.2__py3-none-any.whl → 2023.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,882 +0,0 @@
- from __future__ import absolute_import, division, print_function
-
- import io
- import logging
- import re
- import urllib.error
- import urllib.parse
- from copy import copy
- from json import dumps, loads
- from urllib.parse import urlparse
-
- try:
-     import yarl
- except (ImportError, ModuleNotFoundError, OSError):
-     yarl = False
-
- from fsspec.callbacks import _DEFAULT_CALLBACK
- from fsspec.registry import register_implementation
- from fsspec.spec import AbstractBufferedFile, AbstractFileSystem
- from fsspec.utils import DEFAULT_BLOCK_SIZE, isfilelike, nullcontext, tokenize
-
- from ..caching import AllBytes
-
- # https://stackoverflow.com/a/15926317/3821154
- ex = re.compile(r"""<(a|A)\s+(?:[^>]*?\s+)?(href|HREF)=["'](?P<url>[^"']+)""")
- ex2 = re.compile(r"""(?P<url>http[s]?://[-a-zA-Z0-9@:%_+.~#?&/=]+)""")
- logger = logging.getLogger("fsspec.http")
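A quick illustration of what these two patterns extract (an editorial sketch, not part of the package):

```python
# ex captures href targets from <a> tags; ex2 catches bare http(s) URLs.
html = '<a href="file1.bin">f1</a> see also https://example.org/file2.bin'
print([m[2] for m in ex.findall(html)])  # ['file1.bin']
print(ex2.findall(html))                 # ['https://example.org/file2.bin']
```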
-
-
- class JsHttpException(urllib.error.HTTPError):
-     ...
-
-
- class StreamIO(io.BytesIO):
-     # fake class, so you can set attributes on it
-     # will eventually actually stream
-     ...
-
-
- class ResponseProxy:
-     """Looks like a requests response"""
-
-     def __init__(self, req, stream=False):
-         self.request = req
-         self.stream = stream
-         self._data = None
-         self._headers = None
-
-     @property
-     def raw(self):
-         if self._data is None:
-             b = self.request.response.to_bytes()
-             if self.stream:
-                 self._data = StreamIO(b)
-             else:
-                 self._data = b
-         return self._data
-
-     def close(self):
-         if hasattr(self, "_data"):
-             del self._data
-
-     @property
-     def headers(self):
-         if self._headers is None:
-             self._headers = dict(
-                 [
-                     _.split(": ")
-                     for _ in self.request.getAllResponseHeaders().strip().split("\r\n")
-                 ]
-             )
-         return self._headers
-
-     @property
-     def status_code(self):
-         return int(self.request.status)
-
-     def raise_for_status(self):
-         if not self.ok:
-             raise JsHttpException(
-                 self.url, self.status_code, self.reason, self.headers, None
-             )
-
-     @property
-     def reason(self):
-         return self.request.statusText
-
-     @property
-     def ok(self):
-         return self.status_code < 400
-
-     @property
-     def url(self):
-         return self.request.response.responseURL
-
-     @property
-     def text(self):
-         # TODO: encoding from headers
-         return self.content.decode()
-
-     @property
-     def content(self):
-         self.stream = False
-         return self.raw
-
-     @property
-     def json(self):
-         return loads(self.text)
-
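A minimal sketch of the proxy's lazy behaviour, using a hypothetical stand-in for a real XMLHttpRequest (the real object only exists in a JS runtime):

```python
class _FakeResponse:
    responseURL = "https://example.org/x"
    def to_bytes(self):
        return b"payload"

class _FakeXHR:
    status = 200
    statusText = "OK"
    response = _FakeResponse()
    def getAllResponseHeaders(self):
        return "Content-Length: 7\r\nContent-Type: text/plain"

resp = ResponseProxy(_FakeXHR(), stream=False)
assert resp.ok and resp.status_code == 200
assert resp.content == b"payload"             # fetched once, then cached in _data
assert resp.headers["Content-Length"] == "7"  # headers parsed on first access
```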
- class RequestsSessionShim:
-     def __init__(self):
-         self.headers = {}
-
-     def request(
-         self,
-         method,
-         url,
-         params=None,
-         data=None,
-         headers=None,
-         cookies=None,
-         files=None,
-         auth=None,
-         timeout=None,
-         allow_redirects=None,
-         proxies=None,
-         hooks=None,
-         stream=None,
-         verify=None,
-         cert=None,
-         json=None,
-     ):
-         import js
-         from js import Blob, XMLHttpRequest
-
-         if hasattr(js, "document"):
-             raise RuntimeError("Filesystem can only be run from a worker, not main")
-
-         logger.debug("JS request: %s %s", method, url)
-
-         if cert or verify or proxies or files or cookies or hooks:
-             raise NotImplementedError
-         if data and json:
-             raise ValueError("Use json= or data=, not both")
-         req = XMLHttpRequest.new()
-         extra = auth if auth else ()
-         if params:
-             url = f"{url}?{urllib.parse.urlencode(params)}"
-         req.open(method, url, False, *extra)
-         if timeout:
-             req.timeout = timeout
-         if headers:
-             for k, v in headers.items():
-                 req.setRequestHeader(k, v)
-
-         req.setRequestHeader("Accept", "application/octet-stream")
-         req.responseType = "arraybuffer"
-         if json:
-             blob = Blob.new([dumps(json)], {"type": "application/json"})
-             req.send(blob)
-         elif data:
-             if isinstance(data, io.IOBase):
-                 data = data.read()
-             blob = Blob.new([data], {"type": "application/octet-stream"})
-             req.send(blob)
-         else:
-             req.send(None)
-         return ResponseProxy(req, stream=stream)
-
-     def get(self, url, **kwargs):
-         return self.request("GET", url, **kwargs)
-
-     def head(self, url, **kwargs):
-         return self.request("HEAD", url, **kwargs)
-
-     def post(self, url, **kwargs):
-         return self.request("POST", url, **kwargs)
-
-     def put(self, url, **kwargs):
-         return self.request("PUT", url, **kwargs)
-
-     def patch(self, url, **kwargs):
-         return self.request("PATCH", url, **kwargs)
-
-     def delete(self, url, **kwargs):
-         return self.request("DELETE", url, **kwargs)
-
-
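Usage sketch for the shim; this only works inside a Pyodide web worker (the main thread raises RuntimeError, and CPython lacks the `js` module), and the URL is a placeholder:

```python
session = RequestsSessionShim()
resp = session.get("https://example.org/data.bin", params={"rev": "1"})
resp.raise_for_status()
print(resp.status_code, len(resp.content))  # synchronous XHR under the hood
```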
- class HTTPFileSystem(AbstractFileSystem):
-     """
-     Simple File-System for fetching data via HTTP(S)
-
-     ``ls()`` is implemented by loading the parent page and doing a regex
-     match on the result. If simple_links=True, anything of the form
-     "http(s)://server.com/stuff?thing=other" is considered a link; otherwise
-     only links within HTML href tags are used.
-     """
-
-     sep = "/"
-
-     def __init__(
-         self,
-         simple_links=True,
-         block_size=None,
-         same_scheme=True,
-         cache_type="readahead",
-         cache_options=None,
-         client_kwargs=None,
-         encoded=False,
-         **storage_options,
-     ):
-         """
-
-         Parameters
-         ----------
-         block_size: int
-             Blocks to read bytes; if 0, will default to raw requests file-like
-             objects instead of HTTPFile instances
-         simple_links: bool
-             If True, will consider both HTML <a> tags and anything that looks
-             like a URL; if False, will consider only the former.
-         same_scheme: bool
-             When doing ls/glob, if this is True, only consider paths that have
-             http/https matching the input URLs.
-         size_policy: this argument is deprecated
-         client_kwargs: dict
-             Passed to aiohttp.ClientSession, see
-             https://docs.aiohttp.org/en/stable/client_reference.html
-             For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
-         storage_options: key-value
-             Any other parameters passed on to requests
-         cache_type, cache_options: defaults used in open
-         """
-         super().__init__(self, **storage_options)
-         self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
-         self.simple_links = simple_links
-         self.same_schema = same_scheme
-         self.cache_type = cache_type
-         self.cache_options = cache_options
-         self.client_kwargs = client_kwargs or {}
-         self.encoded = encoded
-         self.kwargs = storage_options
-
-         try:
-             import js  # noqa: F401
-
-             logger.debug("Starting JS session")
-             self.session = RequestsSessionShim()
-             self.js = True
-         except Exception as e:
-             import requests
-
-             logger.debug("Starting cpython session because of: %s", e)
-             self.session = requests.Session(**(client_kwargs or {}))
-             self.js = False
-
-         request_options = copy(storage_options)
-         self.use_listings_cache = request_options.pop("use_listings_cache", False)
-         request_options.pop("listings_expiry_time", None)
-         request_options.pop("max_paths", None)
-         request_options.pop("skip_instance_cache", None)
-         self.kwargs = request_options
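Basic usage, as a hedged sketch (the example.org URLs are placeholders, not a real dataset):

```python
fs = HTTPFileSystem(simple_links=True, block_size=2**20)
listing = fs.ls("https://example.org/data/", detail=False)        # scrape index page
head = fs.cat_file("https://example.org/data/file.bin", start=0, end=1024)
with fs.open("https://example.org/data/file.bin") as f:           # HTTPFile if size known
    first = f.read(256)
```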
-
-     @property
-     def fsid(self):
-         return "http"
-
-     def encode_url(self, url):
-         if yarl:
-             return yarl.URL(url, encoded=self.encoded)
-         return url
-
-     @classmethod
-     def _strip_protocol(cls, path):
-         """For HTTP, we always want to keep the full URL"""
-         return path
-
-     @classmethod
-     def _parent(cls, path):
-         # override, since _strip_protocol is different for URLs
-         par = super()._parent(path)
-         if len(par) > 7:  # "http://..."
-             return par
-         return ""
-
-     def _ls_real(self, url, detail=True, **kwargs):
-         # ignoring URL-encoded arguments
-         kw = self.kwargs.copy()
-         kw.update(kwargs)
-         logger.debug(url)
-         r = self.session.get(self.encode_url(url), **kw)
-         self._raise_not_found_for_status(r, url)
-         text = r.text
-         if self.simple_links:
-             links = ex2.findall(text) + [u[2] for u in ex.findall(text)]
-         else:
-             links = [u[2] for u in ex.findall(text)]
-         out = set()
-         parts = urlparse(url)
-         for l in links:
-             if isinstance(l, tuple):
-                 l = l[1]
-             if l.startswith("/") and len(l) > 1:
-                 # absolute URL on this server
-                 l = parts.scheme + "://" + parts.netloc + l
-             if l.startswith("http"):
-                 if self.same_schema and l.startswith(url.rstrip("/") + "/"):
-                     out.add(l)
-                 elif l.replace("https", "http").startswith(
-                     url.replace("https", "http").rstrip("/") + "/"
-                 ):
-                     # allowed to cross http <-> https
-                     out.add(l)
-             else:
-                 if l not in ["..", "../"]:
-                     # Ignore FTP-like "parent"
-                     out.add("/".join([url.rstrip("/"), l.lstrip("/")]))
-         if not out and url.endswith("/"):
-             out = self._ls_real(url.rstrip("/"), detail=False)
-         if detail:
-             return [
-                 {
-                     "name": u,
-                     "size": None,
-                     "type": "directory" if u.endswith("/") else "file",
-                 }
-                 for u in out
-             ]
-         else:
-             return list(sorted(out))
-
-     def ls(self, url, detail=True, **kwargs):
-         if self.use_listings_cache and url in self.dircache:
-             out = self.dircache[url]
-         else:
-             out = self._ls_real(url, detail=detail, **kwargs)
-             self.dircache[url] = out
-         return out
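Sketch of the listings cache in action, reusing the placeholder server from above:

```python
fs = HTTPFileSystem(use_listings_cache=True)
a = fs.ls("https://example.org/data/")  # one GET; result stored in fs.dircache
b = fs.ls("https://example.org/data/")  # served from fs.dircache, no request
```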
-
-     def _raise_not_found_for_status(self, response, url):
-         """
-         Raises FileNotFoundError for 404s, otherwise uses raise_for_status.
-         """
-         if response.status_code == 404:
-             raise FileNotFoundError(url)
-         response.raise_for_status()
-
-     def cat_file(self, url, start=None, end=None, **kwargs):
-         kw = self.kwargs.copy()
-         kw.update(kwargs)
-         logger.debug(url)
-
-         if start is not None or end is not None:
-             if start == end:
-                 return b""
-             headers = kw.pop("headers", {}).copy()
-             headers["Range"] = self._process_limits(url, start, end)
-             kw["headers"] = headers
-         r = self.session.get(self.encode_url(url), **kw)
-         self._raise_not_found_for_status(r, url)
-         return r.content
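A ranged read is translated into an HTTP Range header by the inherited `_process_limits` helper; start/end follow Python slice conventions (end exclusive). A sketch, with a placeholder URL:

```python
fs = HTTPFileSystem()
chunk = fs.cat_file("https://example.org/big.bin", start=0, end=1024)
# roughly equivalent to a GET carrying "Range: bytes=0-1023"
```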
-
-     def get_file(
-         self, rpath, lpath, chunk_size=5 * 2**20, callback=_DEFAULT_CALLBACK, **kwargs
-     ):
-         kw = self.kwargs.copy()
-         kw.update(kwargs)
-         logger.debug(rpath)
-         r = self.session.get(self.encode_url(rpath), **kw)
-         try:
-             size = int(r.headers["content-length"])
-         except (ValueError, KeyError):
-             size = None
-
-         callback.set_size(size)
-         self._raise_not_found_for_status(r, rpath)
-         if not isfilelike(lpath):
-             lpath = open(lpath, "wb")
-         chunk = True
-         while chunk:
-             r.raw.decode_content = True
-             chunk = r.raw.read(chunk_size)
-             lpath.write(chunk)
-             callback.relative_update(len(chunk))
-
-     def put_file(
-         self,
-         lpath,
-         rpath,
-         chunk_size=5 * 2**20,
-         callback=_DEFAULT_CALLBACK,
-         method="post",
-         **kwargs,
-     ):
-         def gen_chunks():
-             # Support passing arbitrary file-like objects
-             # and use them instead of streams.
-             if isinstance(lpath, io.IOBase):
-                 context = nullcontext(lpath)
-                 use_seek = False  # might not support seeking
-             else:
-                 context = open(lpath, "rb")
-                 use_seek = True
-
-             with context as f:
-                 if use_seek:
-                     callback.set_size(f.seek(0, 2))
-                     f.seek(0)
-                 else:
-                     callback.set_size(getattr(f, "size", None))
-
-                 chunk = f.read(chunk_size)
-                 while chunk:
-                     yield chunk
-                     callback.relative_update(len(chunk))
-                     chunk = f.read(chunk_size)
-
-         kw = self.kwargs.copy()
-         kw.update(kwargs)
-
-         method = method.lower()
-         if method not in ("post", "put"):
-             raise ValueError(
-                 f"method has to be either 'post' or 'put', not: {method!r}"
-             )
-
-         meth = getattr(self.session, method)
-         resp = meth(rpath, data=gen_chunks(), **kw)
-         self._raise_not_found_for_status(resp, rpath)
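Upload sketch, again against placeholder endpoints:

```python
fs = HTTPFileSystem()
fs.put_file("local.bin", "https://example.org/upload", method="put")
# A file-like object also works; its size is taken from a .size attribute if present.
fs.put_file(io.BytesIO(b"abc"), "https://example.org/upload", method="post")
```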
-
-     def exists(self, path, **kwargs):
-         kw = self.kwargs.copy()
-         kw.update(kwargs)
-         try:
-             logger.debug(path)
-             r = self.session.get(self.encode_url(path), **kw)
-             return r.status_code < 400
-         except Exception:
-             return False
-
-     def isfile(self, path, **kwargs):
-         return self.exists(path, **kwargs)
-
-     def _open(
-         self,
-         path,
-         mode="rb",
-         block_size=None,
-         autocommit=None,  # XXX: This differs from the base class.
-         cache_type=None,
-         cache_options=None,
-         size=None,
-         **kwargs,
-     ):
-         """Make a file-like object
-
-         Parameters
-         ----------
-         path: str
-             Full URL with protocol
-         mode: string
-             must be "rb"
-         block_size: int or None
-             Bytes to download in one request; use instance value if None. If
-             zero, will return a streaming Requests file-like instance.
-         kwargs: key-value
-             Any other parameters, passed to requests calls
-         """
-         if mode != "rb":
-             raise NotImplementedError
-         block_size = block_size if block_size is not None else self.block_size
-         kw = self.kwargs.copy()
-         kw.update(kwargs)
-         size = size or self.info(path, **kwargs)["size"]
-         if block_size and size:
-             return HTTPFile(
-                 self,
-                 path,
-                 session=self.session,
-                 block_size=block_size,
-                 mode=mode,
-                 size=size,
-                 cache_type=cache_type or self.cache_type,
-                 cache_options=cache_options or self.cache_options,
-                 **kw,
-             )
-         else:
-             return HTTPStreamFile(
-                 self,
-                 path,
-                 mode=mode,
-                 session=self.session,
-                 **kw,
-             )
-
-     def ukey(self, url):
-         """Unique identifier; assume HTTP files are static, unchanging"""
-         return tokenize(url, self.kwargs, self.protocol)
-
-     def info(self, url, **kwargs):
-         """Get info of URL
-
-         Tries to access location via HEAD, and then GET methods, but does
-         not fetch the data.
-
-         It is possible that the server does not supply any size information, in
-         which case size will be given as None (and certain operations on the
-         corresponding file will not work).
-         """
-         info = {}
-         for policy in ["head", "get"]:
-             try:
-                 info.update(
-                     _file_info(
-                         self.encode_url(url),
-                         size_policy=policy,
-                         session=self.session,
-                         **self.kwargs,
-                         **kwargs,
-                     )
-                 )
-                 if info.get("size") is not None:
-                     break
-             except Exception as exc:
-                 if policy == "get":
-                     # If get failed, then raise a FileNotFoundError
-                     raise FileNotFoundError(url) from exc
-                 logger.debug(str(exc))
-
-         return {"name": url, "size": None, **info, "type": "file"}
-
-     def glob(self, path, **kwargs):
-         """
-         Find files by glob-matching.
-
-         This implementation is identical to the one in AbstractFileSystem,
-         but "?" is not considered a globbing character, because it is
-         so common in URLs, often identifying the "query" part.
-         """
-         import re
-
-         ends = path.endswith("/")
-         path = self._strip_protocol(path)
-         indstar = path.find("*") if path.find("*") >= 0 else len(path)
-         indbrace = path.find("[") if path.find("[") >= 0 else len(path)
-
-         ind = min(indstar, indbrace)
-
-         detail = kwargs.pop("detail", False)
-
-         if not has_magic(path):
-             root = path
-             depth = 1
-             if ends:
-                 path += "/*"
-             elif self.exists(path):
-                 if not detail:
-                     return [path]
-                 else:
-                     return {path: self.info(path)}
-             else:
-                 if not detail:
-                     return []  # glob of non-existent returns empty
-                 else:
-                     return {}
-         elif "/" in path[:ind]:
-             ind2 = path[:ind].rindex("/")
-             root = path[: ind2 + 1]
-             depth = None if "**" in path else path[ind2 + 1 :].count("/") + 1
-         else:
-             root = ""
-             depth = None if "**" in path else path[ind + 1 :].count("/") + 1
-
-         allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
-         # Escape characters special to python regex, leaving our supported
-         # special characters in place.
-         # See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html
-         # for shell globbing details.
-         pattern = (
-             "^"
-             + (
-                 path.replace("\\", r"\\")
-                 .replace(".", r"\.")
-                 .replace("+", r"\+")
-                 .replace("//", "/")
-                 .replace("(", r"\(")
-                 .replace(")", r"\)")
-                 .replace("|", r"\|")
-                 .replace("^", r"\^")
-                 .replace("$", r"\$")
-                 .replace("{", r"\{")
-                 .replace("}", r"\}")
-                 .rstrip("/")
-             )
-             + "$"
-         )
-         pattern = re.sub("[*]{2}", "=PLACEHOLDER=", pattern)
-         pattern = re.sub("[*]", "[^/]*", pattern)
-         pattern = re.compile(pattern.replace("=PLACEHOLDER=", ".*"))
-         out = {
-             p: allpaths[p]
-             for p in sorted(allpaths)
-             if pattern.match(p.replace("//", "/").rstrip("/"))
-         }
-         if detail:
-             return out
-         else:
-             return list(out)
-
-     def isdir(self, path):
-         # override, since all URLs are (also) files
-         try:
-             return bool(self.ls(path))
-         except (FileNotFoundError, ValueError):
-             return False
-
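Because "?" stays literal here, query strings survive globbing; only "*" and "[" trigger matching (see `has_magic` further down). A sketch with placeholder URLs:

```python
fs = HTTPFileSystem()
fs.glob("https://example.org/data/*.csv")  # expanded via find()/ls()
fs.glob("https://example.org/api?item=1")  # no magic characters: existence check only
```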
-
- class HTTPFile(AbstractBufferedFile):
-     """
-     A file-like object pointing to a remote HTTP(S) resource
-
-     Supports only reading, with read-ahead of a predetermined block-size.
-
-     In the case that the server does not supply the filesize, only reading of
-     the complete file in one go is supported.
-
-     Parameters
-     ----------
-     url: str
-         Full URL of the remote resource, including the protocol
-     session: requests.Session or None
-         All calls will be made within this session, to avoid restarting
-         connections where the server allows this
-     block_size: int or None
-         The amount of read-ahead to do, in bytes. Default is 5MB, or the value
-         configured for the FileSystem creating this file
-     size: None or int
-         If given, this is the size of the file in bytes, and we don't attempt
-         to call the server to find the value.
-     kwargs: all other key-values are passed to requests calls.
-     """
-
-     def __init__(
-         self,
-         fs,
-         url,
-         session=None,
-         block_size=None,
-         mode="rb",
-         cache_type="bytes",
-         cache_options=None,
-         size=None,
-         **kwargs,
-     ):
-         if mode != "rb":
-             raise NotImplementedError("File mode not supported")
-         self.url = url
-         self.session = session
-         self.details = {"name": url, "size": size, "type": "file"}
-         super().__init__(
-             fs=fs,
-             path=url,
-             mode=mode,
-             block_size=block_size,
-             cache_type=cache_type,
-             cache_options=cache_options,
-             **kwargs,
-         )
-
-     def read(self, length=-1):
-         """Read bytes from file
-
-         Parameters
-         ----------
-         length: int
-             Read up to this many bytes. If negative, read all content to end of
-             file. If the server has not supplied the filesize, attempting to
-             read only part of the data will raise a ValueError.
-         """
-         if (
-             (length < 0 and self.loc == 0)  # explicit read all
-             # but not when the size is known and fits into a block anyways
-             and not (self.size is not None and self.size <= self.blocksize)
-         ):
-             self._fetch_all()
-         if self.size is None:
-             if length < 0:
-                 self._fetch_all()
-         else:
-             length = min(self.size - self.loc, length)
-         return super().read(length)
-
-     def _fetch_all(self):
-         """Read whole file in one shot, without caching
-
-         This is only called when position is still at zero,
-         and read() is called without a byte-count.
-         """
-         logger.debug(f"Fetch all for {self}")
-         if not isinstance(self.cache, AllBytes):
-             r = self.session.get(self.fs.encode_url(self.url), **self.kwargs)
-             r.raise_for_status()
-             out = r.content
-             self.cache = AllBytes(size=len(out), fetcher=None, blocksize=None, data=out)
-             self.size = len(out)
-
-     def _parse_content_range(self, headers):
-         """Parse the Content-Range header"""
-         s = headers.get("Content-Range", "")
-         m = re.match(r"bytes (\d+-\d+|\*)/(\d+|\*)", s)
-         if not m:
-             return None, None, None
-
-         if m[1] == "*":
-             start = end = None
-         else:
-             start, end = [int(x) for x in m[1].split("-")]
-         total = None if m[2] == "*" else int(m[2])
-         return start, end, total
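The three header shapes this parser handles, with `f` standing in for a hypothetical HTTPFile instance:

```python
f._parse_content_range({"Content-Range": "bytes 0-1023/4096"})  # (0, 1023, 4096)
f._parse_content_range({"Content-Range": "bytes */4096"})       # (None, None, 4096)
f._parse_content_range({})                                      # (None, None, None)
```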
-
-     def _fetch_range(self, start, end):
-         """Download a block of data
-
-         The expectation is that the server returns only the requested bytes,
-         with HTTP code 206. If this is not the case, we first check the headers,
-         and then stream the output - if the data size is bigger than we
-         requested, an exception is raised.
-         """
-         logger.debug(f"Fetch range for {self}: {start}-{end}")
-         kwargs = self.kwargs.copy()
-         headers = kwargs.pop("headers", {}).copy()
-         headers["Range"] = "bytes=%i-%i" % (start, end - 1)
-         logger.debug(str(self.url) + " : " + headers["Range"])
-         r = self.session.get(self.fs.encode_url(self.url), headers=headers, **kwargs)
-         if r.status_code == 416:
-             # range request outside file
-             return b""
-         r.raise_for_status()
-
-         # If the server has handled the range request, it should reply
-         # with status 206 (partial content). But we'll guess that a suitable
-         # Content-Range header or a Content-Length no more than the
-         # requested range also mean we have got the desired range.
-         cl = r.headers.get("Content-Length", r.headers.get("content-length", end + 1))
-         response_is_range = (
-             r.status_code == 206
-             or self._parse_content_range(r.headers)[0] == start
-             or int(cl) <= end - start
-         )
-
-         if response_is_range:
-             # partial content, as expected
-             out = r.content
-         elif start > 0:
-             raise ValueError(
-                 "The HTTP server doesn't appear to support range requests. "
-                 "Only reading this file from the beginning is supported. "
-                 "Open with block_size=0 for a streaming file interface."
-             )
-         else:
-             # Response is not a range, but we want the start of the file,
-             # so we can read the required amount anyway.
-             cl = 0
-             out = []
-             while True:
-                 r.raw.decode_content = True
-                 chunk = r.raw.read(2**20)
-                 # data size unknown, let's read until we have enough
-                 if chunk:
-                     out.append(chunk)
-                     cl += len(chunk)
-                     if cl > end - start:
-                         break
-                 else:
-                     break
-             r.raw.close()
-             out = b"".join(out)[: end - start]
-         return out
-
-
- magic_check = re.compile("([*[])")
-
-
- def has_magic(s):
-     match = magic_check.search(s)
-     return match is not None
-
-
- class HTTPStreamFile(AbstractBufferedFile):
-     def __init__(self, fs, url, mode="rb", session=None, **kwargs):
-         self.url = url
-         self.session = session
-         if mode != "rb":
-             raise ValueError
-         self.details = {"name": url, "size": None}
-         super().__init__(fs=fs, path=url, mode=mode, cache_type="readahead", **kwargs)
-
-         r = self.session.get(self.fs.encode_url(url), stream=True, **kwargs)
-         r.raw.decode_content = True
-         self.fs._raise_not_found_for_status(r, url)
-
-         self.r = r
-
-     def seek(self, *args, **kwargs):
-         raise ValueError("Cannot seek streaming HTTP file")
-
-     def read(self, num=-1):
-         bufs = []
-         leng = 0
-         while not self.r.raw.closed and (leng < num or num < 0):
-             out = self.r.raw.read(num)
-             if out:
-                 bufs.append(out)
-             else:
-                 break
-             leng += len(out)
-         self.loc += leng
-         return b"".join(bufs)
-
-     def close(self):
-         self.r.close()
-
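Opening with `block_size=0` (or when no size is known) selects HTTPStreamFile, which only supports forward reads. A sketch, placeholder URL again:

```python
fs = HTTPFileSystem()
with fs.open("https://example.org/live-feed", block_size=0) as f:
    while True:
        buf = f.read(2**16)  # seek() would raise ValueError
        if not buf:
            break
```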
-
- def get_range(session, url, start, end, **kwargs):
-     # explicit get a range when we know it must be safe
-     kwargs = kwargs.copy()
-     headers = kwargs.pop("headers", {}).copy()
-     headers["Range"] = "bytes=%i-%i" % (start, end - 1)
-     r = session.get(url, headers=headers, **kwargs)
-     r.raise_for_status()
-     return r.content
-
-
- def _file_info(url, session, size_policy="head", **kwargs):
-     """Call HEAD on the server to get details about the file (size/checksum etc.)
-
-     Default operation is to explicitly allow redirects and use encoding
-     'identity' (no compression) to get the true size of the target.
-     """
-     logger.debug("Retrieve file size for %s" % url)
-     kwargs = kwargs.copy()
-     ar = kwargs.pop("allow_redirects", True)
-     head = kwargs.get("headers", {}).copy()
-     # TODO: not allowed in JS
-     # head["Accept-Encoding"] = "identity"
-     kwargs["headers"] = head
-
-     info = {}
-     if size_policy == "head":
-         r = session.head(url, allow_redirects=ar, **kwargs)
-     elif size_policy == "get":
-         r = session.get(url, allow_redirects=ar, **kwargs)
-     else:
-         raise TypeError('size_policy must be "head" or "get", got %s' % size_policy)
-     r.raise_for_status()
-
-     # TODO:
-     # recognise lack of 'Accept-Ranges',
-     # or 'Accept-Ranges': 'none' (not 'bytes')
-     # to mean streaming only, no random access => return None
-     if "Content-Length" in r.headers:
-         info["size"] = int(r.headers["Content-Length"])
-     elif "Content-Range" in r.headers:
-         info["size"] = int(r.headers["Content-Range"].split("/")[1])
-     if "content-length" in r.headers:
-         info["size"] = int(r.headers["content-length"])
-     elif "content-range" in r.headers:
-         info["size"] = int(r.headers["content-range"].split("/")[1])
-
-     for checksum_field in ["ETag", "Content-MD5", "Digest"]:
-         if r.headers.get(checksum_field):
-             info[checksum_field] = r.headers[checksum_field]
-
-     return info
-
-
- # importing this is enough to register it
- register_implementation("http", HTTPFileSystem, clobber=True)
- register_implementation("https", HTTPFileSystem, clobber=True)