cloud-files 5.5.0__tar.gz → 5.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cloud_files-5.5.0 → cloud_files-5.6.0}/.github/workflows/test-suite.yml +1 -1
- {cloud_files-5.5.0 → cloud_files-5.6.0}/ChangeLog +11 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/PKG-INFO +72 -2
- {cloud_files-5.5.0 → cloud_files-5.6.0}/README.md +64 -1
- {cloud_files-5.5.0 → cloud_files-5.6.0}/automated_test.py +121 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloud_files.egg-info/PKG-INFO +72 -2
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloud_files.egg-info/SOURCES.txt +1 -0
- cloud_files-5.6.0/cloud_files.egg-info/pbr.json +1 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloud_files.egg-info/requires.txt +9 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles/cloudfiles.py +245 -53
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles/interfaces.py +55 -38
- cloud_files-5.6.0/cloudfiles/monitoring.py +724 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles_cli/cloudfiles_cli.py +87 -14
- {cloud_files-5.5.0 → cloud_files-5.6.0}/requirements.txt +1 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/setup.cfg +6 -0
- cloud_files-5.5.0/cloud_files.egg-info/pbr.json +0 -1
- {cloud_files-5.5.0 → cloud_files-5.6.0}/AUTHORS +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/LICENSE +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/MANIFEST.in +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloud_files.egg-info/dependency_links.txt +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloud_files.egg-info/entry_points.txt +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloud_files.egg-info/not-zip-safe +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloud_files.egg-info/top_level.txt +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles/__init__.py +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles/compression.py +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles/connectionpools.py +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles/exceptions.py +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles/gcs.py +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles/lib.py +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles/paths.py +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles/resumable_tools.py +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles/scheduler.py +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles/secrets.py +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles/test.py +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles/threaded_queue.py +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles/typing.py +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles_cli/LICENSE +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/cloudfiles_cli/__init__.py +0 -0
- {cloud_files-5.5.0 → cloud_files-5.6.0}/setup.py +0 -0
{cloud_files-5.5.0 → cloud_files-5.6.0}/.github/workflows/test-suite.yml

```diff
@@ -26,7 +26,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        if [ -f requirements.txt ]; then pip install -e ".[test]"; fi
+        if [ -f requirements.txt ]; then pip install -e ".[test,monitoring]"; fi
     - name: Test with pytest
       run: |
         python -m pytest -v -x automated_test.py
```
{cloud_files-5.5.0 → cloud_files-5.6.0}/ChangeLog

```diff
@@ -1,6 +1,17 @@
 CHANGES
 =======
 
+5.6.0
+-----
+
+* docs: errata in how to use performance monitoring tools
+* feat: performance monitoring (#117)
+* docs: describe the cloudfiles constructor
+* fix(cli): rm can interpret aliases properly
+* fix: remove streaming for small s3 uploads
+* docs: give more examples for CloudFile
+* perf: aws-chunked was a bug in moto, that's now fixed. Stop adding copy_object
+
 5.5.0
 -----
 
```
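The headline entry is the performance monitoring feature (#117). As a quick orientation before the full diffs below, here is a minimal sketch of the workflow those changes document; the bucket path and file names are illustrative, while the API calls come from the README additions in this diff:

```python
from cloudfiles import CloudFiles

# Illustrative bucket and file names, not from the package.
cf = CloudFiles("gs://bucket/dataset")

# return_recording=True additionally returns a TransmissionMonitor
# that records each file's flight time and size.
results, tm = cf.get(["a.bin", "b.bin", "c.bin"], return_recording=True)

print(tm.peak_Mbps())   # estimated peak transfer rate
print(tm.total_Mbps())  # estimated average transfer rate
tm.plot_gantt()         # time-of-flight chart (requires the monitoring extra)
```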
{cloud_files-5.5.0 → cloud_files-5.6.0}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cloud-files
-Version: 5.5.0
+Version: 5.6.0
 Summary: Fast access to cloud storage and local FS.
 Home-page: https://github.com/seung-lab/cloud-files/
 Author: William Silversmith
@@ -30,6 +30,7 @@ Requires-Dist: google-auth>=1.10.0
 Requires-Dist: google-cloud-core>=1.1.0
 Requires-Dist: google-cloud-storage>=1.31.1
 Requires-Dist: google-crc32c>=1.0.0
+Requires-Dist: intervaltree
 Requires-Dist: orjson
 Requires-Dist: pathos
 Requires-Dist: protobuf>=3.3.0
@@ -46,6 +47,12 @@ Requires-Dist: pytest; extra == "test"
 Requires-Dist: moto>=5; extra == "test"
 Provides-Extra: numpy
 Requires-Dist: numpy; extra == "numpy"
+Provides-Extra: monitoring
+Requires-Dist: psutil; extra == "monitoring"
+Requires-Dist: intervaltree; extra == "monitoring"
+Requires-Dist: matplotlib; extra == "monitoring"
+Provides-Extra: apache
+Requires-Dist: lxml; extra == "apache"
 
 [![PyPI version](https://badge.fury.io/py/cloud-files.svg)](https://badge.fury.io/py/cloud-files) [![Test Suite](https://github.com/seung-lab/cloud-files/workflows/Test%20Suite/badge.svg)](https://github.com/seung-lab/cloud-files/actions?query=workflow%3A%22Test+Suite%22)
 
@@ -97,8 +104,20 @@ cf.touch([ "example", "example2" ])
 cf = CloudFile("gs://bucket/file1")
 info = cf.head()
 binary = cf.get()
+obj = cf.get_json()
 cf.put(binary)
+cf.put_json(obj)
 cf[:30] # get first 30 bytes of file
+
+num_bytes = cf.size() # get size in bytes (also in head)
+exists = cf.exists() # true or false
+cf.delete() # deletes the file
+cf.touch() # create the file if it doesn't exist
+cf.move("gs://example/destination/directory") # copy then delete source
+cf.transfer_from("gs://example/source/file.txt") # copies file efficiently
+cf.transfer_to("gs://example/dest/file.txt") # copies file efficiently
+
+path = cf.join([ path1, path2, path3 ]) # use the appropriate path separator
 ```
 
 CloudFiles was developed to access files from object storage without ever touching disk. The goal was to reliably and rapidly access a petabyte of image data broken down into tens to hundreds of millions of files being accessed in parallel across thousands of cores. CloudFiles has been used to process dozens of images, many of which were in the hundreds of terabyte range. It has reliably read and written tens of billions of files to date.
@@ -124,6 +143,7 @@ CloudFiles was developed to access files from object storage without ever touchi
 ```bash
 pip install cloud-files
 pip install cloud-files[test] # to enable testing with pytest
+pip install cloud-files[monitoring] # enable plotting network performance
 ```
 
 If you run into trouble installing dependencies, make sure you're using at least Python 3.6 and you have updated pip. On Linux, some dependencies require manylinux2010 or manylinux2014 binaries which earlier versions of pip do not search for. MacOS, Linux, and Windows are supported platforms.
@@ -176,7 +196,9 @@ You can create the `google-secret.json` file [here](https://console.cloud.google
 
 ## API Documentation
 
-Note that the "Cloud Costs" mentioned below are current as of June 2020 and are subject to change. As of this writing, S3 and Google use identical cost structures for these operations.
+Note that the "Cloud Costs" mentioned below are current as of June 2020 and are subject to change. As of this writing, S3 and Google use identical cost structures for these operations.
+
+`CloudFile` is a more intuitive version of `CloudFiles` designed for managing single files instead of groups of files. See examples above. There is an analogous method for each `CloudFiles` method (where it makes sense).
 
 ### Constructor
 ```python
@@ -236,6 +258,10 @@ binary = cf['filename'][0:5] # same result, fetches 11 bytes
 >> b'hello' # represents byte range 0-4 inclusive of filename
 
 binaries = cf[:100] # download the first 100 files
+
+# Get the TransmissionMonitor object that records
+# the flight time of each file.
+binaries, tm = cf.get(..., return_recording=True)
 ```
 
 `get` supports several different styles of input. The simplest takes a scalar filename and returns the contents of that file. However, you can also specify lists of filenames, a byte range request, and lists of byte range requests. You can provide a generator or iterator as input as well. Order is not guaranteed.
@@ -265,6 +291,10 @@ cf.puts([{
 cf.puts([ (path, content), (path, content) ], compression='gzip')
 cf.put_jsons(...)
 
+# Get the TransmissionMonitor object that records the
+# flight times of each object.
+_, tm = cf.puts(..., return_recording=True)
+
 # Definition of put, put_json is identical
 def put(
   self,
@@ -469,6 +499,12 @@ cloudfiles -p 2 cp --progress -r s3://bkt/ gs://bkt2/
 cloudfiles cp -c br s3://bkt/file.txt gs://bkt2/
 # decompress
 cloudfiles cp -c none s3://bkt/file.txt gs://bkt2/
+# save chart of file flight times
+cloudfiles cp --flight-time s3://bkt/file.txt gs://bkt2/
+# save a chart of estimated bandwidth usage from these files alone
+cloudfiles cp --io-rate s3://bkt/file.txt gs://bkt2/
+# save a chart of measured bandwidth usage for the machine
+cloudfiles cp --machine-io-rate s3://bkt/file.txt gs://bkt2/
 # move or rename files
 cloudfiles mv s3://bkt/file.txt gs://bkt2/
 # create an empty file if not existing
@@ -528,6 +564,40 @@ cloudfiles alias rm example # remove example://
 
 The alias file is only accessed (and cached) if CloudFiles encounters an unknown protocol. If you stick to default protocols and use the syntax `s3://https://example.com/` for alternative endpoints, you can still use CloudFiles in environments without filesystem access.
 
+## Performance Monitoring
+
+CloudFiles now comes with two tools inside of `cloudfiles.monitoring` for measuring the performance of transfer operations, both via the CLI and the programmatic interface.
+
+A `TransmissionMonitor` object is created during each download or upload call (e.g. `cf.get` or `cf.puts`). You can access this object by using the `return_recording=True` flag. This object saves the flight time of each object along with its size in an interval tree data structure. It comes with methods for estimating the peak bits per second and can plot both time of flight and the estimated transfer rates (assuming the transfer is evenly divided over the flight of an object, an assumption that is not always true). This allows you to estimate the contribution of a given CloudFiles operation to a machine's network IO.
+
+```python
+from cloudfiles import CloudFiles
+
+...
+
+results, tm = cf.get([ ... some files ... ], return_recording=True)
+
+value = tm.peak_Mbps() # estimated peak transfer rate
+value = tm.total_Mbps() # estimated average transfer rate
+tm.plot_gantt() # time of flight chart
+tm.plot_histogram() # transfer rate chart
+```
+
+A second object, `IOSampler`, can sample the OS network counters using a background thread and provides a global view of the machine's network performance during the life of the transfer. It is enabled for the CLI `cp` command by the `--machine-io-rate` flag, but must be started manually when used programmatically. This is to avoid accidentally starting unnecessary sampling threads. The samples accumulate in a circular buffer, so make sure to set the buffer length long enough for your points of interest to be captured.
+
+```python
+from cloudfiles.monitoring import IOSampler
+
+sampler = IOSampler(buffer_sec=600, interval=0.25)
+sampler.start_sampling()
+
+...
+
+sampler.stop_sampling()
+sampler.plot_histogram()
+```
+
 ## Credits
 
 CloudFiles is derived from the [CloudVolume.Storage](https://github.com/seung-lab/cloud-volume/tree/master/cloudvolume/storage) system.
````
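The test additions later in this diff also exercise a `TransmissionMonitor.merge` classmethod. A hedged sketch of combining the recordings of several operations into one report, assuming `merge` accepts the monitors returned by `get` (the bucket path, batches, and loop structure are illustrative):

```python
from cloudfiles import CloudFiles
from cloudfiles.monitoring import TransmissionMonitor

cf = CloudFiles("gs://bucket/dataset")  # illustrative path

monitors = []
for batch in (["a"], ["b"], ["c"]):  # illustrative batches
    _, tm = cf.get(batch, return_recording=True)
    monitors.append(tm)

# merge() combines the recordings' interval trees, as exercised
# by TestTransmissionMonitor.test_merge in this diff's test file.
merged = TransmissionMonitor.merge(monitors)
print(merged.total_bytes(), merged.peak_Mbps())
```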
{cloud_files-5.5.0 → cloud_files-5.6.0}/README.md

````diff
@@ -48,8 +48,20 @@ cf.touch([ "example", "example2" ])
 cf = CloudFile("gs://bucket/file1")
 info = cf.head()
 binary = cf.get()
+obj = cf.get_json()
 cf.put(binary)
+cf.put_json(obj)
 cf[:30] # get first 30 bytes of file
+
+num_bytes = cf.size() # get size in bytes (also in head)
+exists = cf.exists() # true or false
+cf.delete() # deletes the file
+cf.touch() # create the file if it doesn't exist
+cf.move("gs://example/destination/directory") # copy then delete source
+cf.transfer_from("gs://example/source/file.txt") # copies file efficiently
+cf.transfer_to("gs://example/dest/file.txt") # copies file efficiently
+
+path = cf.join([ path1, path2, path3 ]) # use the appropriate path separator
 ```
 
 CloudFiles was developed to access files from object storage without ever touching disk. The goal was to reliably and rapidly access a petabyte of image data broken down into tens to hundreds of millions of files being accessed in parallel across thousands of cores. CloudFiles has been used to process dozens of images, many of which were in the hundreds of terabyte range. It has reliably read and written tens of billions of files to date.
@@ -75,6 +87,7 @@ CloudFiles was developed to access files from object storage without ever touchi
 ```bash
 pip install cloud-files
 pip install cloud-files[test] # to enable testing with pytest
+pip install cloud-files[monitoring] # enable plotting network performance
 ```
 
 If you run into trouble installing dependencies, make sure you're using at least Python 3.6 and you have updated pip. On Linux, some dependencies require manylinux2010 or manylinux2014 binaries which earlier versions of pip do not search for. MacOS, Linux, and Windows are supported platforms.
@@ -127,7 +140,9 @@ You can create the `google-secret.json` file [here](https://console.cloud.google
 
 ## API Documentation
 
-Note that the "Cloud Costs" mentioned below are current as of June 2020 and are subject to change. As of this writing, S3 and Google use identical cost structures for these operations.
+Note that the "Cloud Costs" mentioned below are current as of June 2020 and are subject to change. As of this writing, S3 and Google use identical cost structures for these operations.
+
+`CloudFile` is a more intuitive version of `CloudFiles` designed for managing single files instead of groups of files. See examples above. There is an analogous method for each `CloudFiles` method (where it makes sense).
 
 ### Constructor
 ```python
@@ -187,6 +202,10 @@ binary = cf['filename'][0:5] # same result, fetches 11 bytes
 >> b'hello' # represents byte range 0-4 inclusive of filename
 
 binaries = cf[:100] # download the first 100 files
+
+# Get the TransmissionMonitor object that records
+# the flight time of each file.
+binaries, tm = cf.get(..., return_recording=True)
 ```
 
 `get` supports several different styles of input. The simplest takes a scalar filename and returns the contents of that file. However, you can also specify lists of filenames, a byte range request, and lists of byte range requests. You can provide a generator or iterator as input as well. Order is not guaranteed.
@@ -216,6 +235,10 @@ cf.puts([{
 cf.puts([ (path, content), (path, content) ], compression='gzip')
 cf.put_jsons(...)
 
+# Get the TransmissionMonitor object that records the
+# flight times of each object.
+_, tm = cf.puts(..., return_recording=True)
+
 # Definition of put, put_json is identical
 def put(
   self,
@@ -420,6 +443,12 @@ cloudfiles -p 2 cp --progress -r s3://bkt/ gs://bkt2/
 cloudfiles cp -c br s3://bkt/file.txt gs://bkt2/
 # decompress
 cloudfiles cp -c none s3://bkt/file.txt gs://bkt2/
+# save chart of file flight times
+cloudfiles cp --flight-time s3://bkt/file.txt gs://bkt2/
+# save a chart of estimated bandwidth usage from these files alone
+cloudfiles cp --io-rate s3://bkt/file.txt gs://bkt2/
+# save a chart of measured bandwidth usage for the machine
+cloudfiles cp --machine-io-rate s3://bkt/file.txt gs://bkt2/
 # move or rename files
 cloudfiles mv s3://bkt/file.txt gs://bkt2/
 # create an empty file if not existing
@@ -479,6 +508,40 @@ cloudfiles alias rm example # remove example://
 
 The alias file is only accessed (and cached) if CloudFiles encounters an unknown protocol. If you stick to default protocols and use the syntax `s3://https://example.com/` for alternative endpoints, you can still use CloudFiles in environments without filesystem access.
 
+## Performance Monitoring
+
+CloudFiles now comes with two tools inside of `cloudfiles.monitoring` for measuring the performance of transfer operations, both via the CLI and the programmatic interface.
+
+A `TransmissionMonitor` object is created during each download or upload call (e.g. `cf.get` or `cf.puts`). You can access this object by using the `return_recording=True` flag. This object saves the flight time of each object along with its size in an interval tree data structure. It comes with methods for estimating the peak bits per second and can plot both time of flight and the estimated transfer rates (assuming the transfer is evenly divided over the flight of an object, an assumption that is not always true). This allows you to estimate the contribution of a given CloudFiles operation to a machine's network IO.
+
+```python
+from cloudfiles import CloudFiles
+
+...
+
+results, tm = cf.get([ ... some files ... ], return_recording=True)
+
+value = tm.peak_Mbps() # estimated peak transfer rate
+value = tm.total_Mbps() # estimated average transfer rate
+tm.plot_gantt() # time of flight chart
+tm.plot_histogram() # transfer rate chart
+```
+
+A second object, `IOSampler`, can sample the OS network counters using a background thread and provides a global view of the machine's network performance during the life of the transfer. It is enabled for the CLI `cp` command by the `--machine-io-rate` flag, but must be started manually when used programmatically. This is to avoid accidentally starting unnecessary sampling threads. The samples accumulate in a circular buffer, so make sure to set the buffer length long enough for your points of interest to be captured.
+
+```python
+from cloudfiles.monitoring import IOSampler
+
+sampler = IOSampler(buffer_sec=600, interval=0.25)
+sampler.start_sampling()
+
+...
+
+sampler.stop_sampling()
+sampler.plot_histogram()
+```
+
 ## Credits
 
 CloudFiles is derived from the [CloudVolume.Storage](https://github.com/seung-lab/cloud-volume/tree/master/cloudvolume/storage) system.
````
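Since `IOSampler` must be started and stopped by hand when used programmatically, a defensive pattern is to wrap the transfer in try/finally so the background sampling thread is always shut down. A minimal sketch using only the constructor arguments and methods shown above; the try/finally wrapper is a generic Python pattern, not something the diff prescribes, and the bucket path and workload are illustrative:

```python
from cloudfiles import CloudFiles
from cloudfiles.monitoring import IOSampler

cf = CloudFiles("gs://bucket/dataset")  # illustrative path

sampler = IOSampler(buffer_sec=600, interval=0.25)
sampler.start_sampling()
try:
    cf.get([str(i) for i in range(1000)])  # illustrative workload
finally:
    # Always stop the background thread, even if the transfer fails.
    sampler.stop_sampling()

sampler.plot_histogram()
```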
{cloud_files-5.5.0 → cloud_files-5.6.0}/automated_test.py

```diff
@@ -4,6 +4,13 @@ import re
 import shutil
 import time
 
+
+import uuid
+from typing import Optional
+from unittest.mock import patch, MagicMock
+import numpy as np
+from cloudfiles.monitoring import TransmissionMonitor, IOSampler, IOEnum
+
 from moto import mock_aws
 
 COMPRESSION_TYPES = [
@@ -1259,3 +1266,117 @@ def test_touch(s3, protocol):
   cf.touch([ str(i) for i in range(20) ])
 
   assert sorted(list(cf)) == sorted([ str(i) for i in range(20) ])
+
+class TestTransmissionMonitor:
+  @pytest.fixture
+  def tx_monitor(self):
+    return TransmissionMonitor(IOEnum.TX)
+
+  @pytest.fixture
+  def rx_monitor(self):
+    return TransmissionMonitor(IOEnum.RX)
+
+  def test_init(self, tx_monitor):
+    assert tx_monitor._direction == IOEnum.TX
+    assert tx_monitor._total_bytes_landed == 0
+    assert len(tx_monitor._in_flight) == 0
+    assert len(tx_monitor._errors) == 0
+    assert tx_monitor._in_flight_bytes == 0
+
+  def test_start_io(self, tx_monitor):
+    flight_id = tx_monitor.start_io(100)
+    assert isinstance(flight_id, uuid.UUID)
+    assert len(tx_monitor._in_flight) == 1
+    assert tx_monitor._in_flight_bytes == 100
+
+  def test_end_io(self, tx_monitor):
+    flight_id = tx_monitor.start_io(100)
+    time.sleep(0.01)
+    tx_monitor.end_io(flight_id, 100)
+
+    assert len(tx_monitor._in_flight) == 0
+    assert tx_monitor._in_flight_bytes == 0
+    assert tx_monitor._total_bytes_landed == 100
+    assert len(tx_monitor._intervaltree) == 1
+
+  def test_end_error(self, tx_monitor):
+    flight_id = tx_monitor.start_io(100)
+    tx_monitor.end_error(flight_id)
+
+    assert flight_id in tx_monitor._errors
+    assert len(tx_monitor._in_flight) == 1 # Still in flight
+
+  def test_total_bytes(self, tx_monitor):
+    flight_id1 = tx_monitor.start_io(100)
+    flight_id2 = tx_monitor.start_io(200)
+    tx_monitor.end_io(flight_id1, 100)
+    tx_monitor.end_io(flight_id2, 200)
+
+    assert tx_monitor.total_bytes() == 300
+
+  def test_total_bps(self, tx_monitor):
+    flight_id = tx_monitor.start_io(100)
+    time.sleep(1.0)
+    tx_monitor.end_io(flight_id, 100)
+
+    bps = tx_monitor.total_bps()
+    assert pytest.approx(bps, rel=0.1) == 800 # 100 bytes * 8 bits/byte / 1 sec
+
+  def test_current_bps(self, tx_monitor):
+    # Test with lookback window
+    flight_id = tx_monitor.start_io(100)
+    time.sleep(1.0)
+    tx_monitor.end_io(flight_id, 100)
+
+    bps = tx_monitor.current_bps(look_back_sec=0.5)
+    assert bps > 0
+
+  def test_peak_bps(self, tx_monitor):
+    flight_id1 = tx_monitor.start_io(100)
+    time.sleep(0.5)
+    tx_monitor.end_io(flight_id1, 100)
+
+    flight_id2 = tx_monitor.start_io(200)
+    time.sleep(0.5)
+    tx_monitor.end_io(flight_id2, 200)
+
+    peak = tx_monitor.peak_bps()
+    assert peak > 0
+
+  def test_histogram(self, tx_monitor):
+    flight_id = tx_monitor.start_io(100)
+    time.sleep(1.0)
+    tx_monitor.end_io(flight_id, 100)
+
+    hist = tx_monitor.histogram(resolution=1.0)
+    assert len(hist) > 0
+    assert np.sum(hist) == 100
+
+  def test_merge(self, tx_monitor):
+    monitor1 = TransmissionMonitor(IOEnum.TX)
+    monitor2 = TransmissionMonitor(IOEnum.TX)
+
+    flight_id1 = monitor1.start_io(100)
+    time.sleep(0.1)
+    monitor1.end_io(flight_id1, 100)
+
+    flight_id2 = monitor2.start_io(200)
+    time.sleep(0.1)
+    monitor2.end_io(flight_id2, 200)
+
+    merged = TransmissionMonitor.merge([monitor1, monitor2])
+
+    assert merged.total_bytes() == 300
+    assert len(merged._intervaltree) == 2
+
+  def test_serialization(self, tx_monitor):
+    flight_id = tx_monitor.start_io(100)
+    time.sleep(0.1)
+    tx_monitor.end_io(flight_id, 100)
+
+    import pickle
+    data = pickle.dumps(tx_monitor)
+    new_monitor = pickle.loads(data)
+
+    assert new_monitor.total_bytes() == 100
+    assert hasattr(new_monitor, '_lock')
```
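These tests double as documentation for the monitor's low-level lifecycle: `start_io(num_bytes)` returns a UUID flight id, `end_io(flight_id, num_bytes)` records the landing, and `end_error(flight_id)` flags a failed flight. A hedged sketch of hand-instrumenting a transfer with those same calls; `upload_blob` is a hypothetical stand-in for real upload code:

```python
import time
from cloudfiles.monitoring import TransmissionMonitor, IOEnum

def upload_blob(data: bytes) -> None:
    # Hypothetical stand-in for a real upload; illustrative only.
    time.sleep(0.05)

tm = TransmissionMonitor(IOEnum.TX)
payload = b"x" * 1024

flight_id = tm.start_io(len(payload))   # register the in-flight bytes
try:
    upload_blob(payload)
    tm.end_io(flight_id, len(payload))  # record the landing time and size
except Exception:
    tm.end_error(flight_id)             # flag the flight as errored
    raise

print(tm.total_bytes(), tm.total_bps())
```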
{cloud_files-5.5.0 → cloud_files-5.6.0}/cloud_files.egg-info/PKG-INFO

The changes to this generated copy of the package metadata are identical to the PKG-INFO diff shown above (+72 -2).
cloud_files-5.6.0/cloud_files.egg-info/pbr.json (new file)

```diff
@@ -0,0 +1 @@
+{"git_version": "4a9dd39", "is_release": true}
```
{cloud_files-5.5.0 → cloud_files-5.6.0}/cloud_files.egg-info/requires.txt

```diff
@@ -9,6 +9,7 @@ google-auth>=1.10.0
 google-cloud-core>=1.1.0
 google-cloud-storage>=1.31.1
 google-crc32c>=1.0.0
+intervaltree
 orjson
 pathos
 protobuf>=3.3.0
@@ -21,6 +22,14 @@ zstandard
 rsa>=4.7.2
 fasteners
 
+[apache]
+lxml
+
+[monitoring]
+psutil
+intervaltree
+matplotlib
+
 [numpy]
 numpy
 
```
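Note that `intervaltree` moved into the core requirements while `psutil` and `matplotlib` ship only with the `monitoring` extra. Code that wants to degrade gracefully when the extra is absent can guard the import; a hedged sketch where the `MONITORING_AVAILABLE` flag is illustrative and the diff does not show how cloudfiles itself guards these imports:

```python
# Illustrative guard for the optional monitoring dependencies
# (psutil and matplotlib only install with cloud-files[monitoring]).
try:
    import psutil
    import matplotlib
    MONITORING_AVAILABLE = True
except ImportError:
    MONITORING_AVAILABLE = False

if MONITORING_AVAILABLE:
    from cloudfiles.monitoring import IOSampler
```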