cloud-files 5.4.1__tar.gz → 5.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {cloud_files-5.4.1 → cloud_files-5.6.0}/.github/workflows/test-suite.yml +1 -1
  2. {cloud_files-5.4.1 → cloud_files-5.6.0}/ChangeLog +14 -3
  3. {cloud_files-5.4.1 → cloud_files-5.6.0}/PKG-INFO +72 -2
  4. {cloud_files-5.4.1 → cloud_files-5.6.0}/README.md +64 -1
  5. {cloud_files-5.4.1 → cloud_files-5.6.0}/automated_test.py +121 -0
  6. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloud_files.egg-info/PKG-INFO +72 -2
  7. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloud_files.egg-info/SOURCES.txt +1 -0
  8. cloud_files-5.6.0/cloud_files.egg-info/pbr.json +1 -0
  9. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloud_files.egg-info/requires.txt +9 -0
  10. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles/cloudfiles.py +245 -53
  11. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles/interfaces.py +55 -38
  12. cloud_files-5.6.0/cloudfiles/monitoring.py +724 -0
  13. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles/scheduler.py +6 -1
  14. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles_cli/cloudfiles_cli.py +87 -14
  15. {cloud_files-5.4.1 → cloud_files-5.6.0}/requirements.txt +1 -0
  16. {cloud_files-5.4.1 → cloud_files-5.6.0}/setup.cfg +6 -0
  17. cloud_files-5.4.1/cloud_files.egg-info/pbr.json +0 -1
  18. {cloud_files-5.4.1 → cloud_files-5.6.0}/AUTHORS +0 -0
  19. {cloud_files-5.4.1 → cloud_files-5.6.0}/LICENSE +0 -0
  20. {cloud_files-5.4.1 → cloud_files-5.6.0}/MANIFEST.in +0 -0
  21. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloud_files.egg-info/dependency_links.txt +0 -0
  22. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloud_files.egg-info/entry_points.txt +0 -0
  23. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloud_files.egg-info/not-zip-safe +0 -0
  24. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloud_files.egg-info/top_level.txt +0 -0
  25. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles/__init__.py +0 -0
  26. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles/compression.py +0 -0
  27. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles/connectionpools.py +0 -0
  28. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles/exceptions.py +0 -0
  29. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles/gcs.py +0 -0
  30. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles/lib.py +0 -0
  31. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles/paths.py +0 -0
  32. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles/resumable_tools.py +0 -0
  33. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles/secrets.py +0 -0
  34. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles/test.py +0 -0
  35. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles/threaded_queue.py +0 -0
  36. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles/typing.py +0 -0
  37. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles_cli/LICENSE +0 -0
  38. {cloud_files-5.4.1 → cloud_files-5.6.0}/cloudfiles_cli/__init__.py +0 -0
  39. {cloud_files-5.4.1 → cloud_files-5.6.0}/setup.py +0 -0
{cloud_files-5.4.1 → cloud_files-5.6.0}/.github/workflows/test-suite.yml
@@ -26,7 +26,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        if [ -f requirements.txt ]; then pip install -e ".[test]"; fi
+        if [ -f requirements.txt ]; then pip install -e ".[test,monitoring]"; fi
     - name: Test with pytest
       run: |
         python -m pytest -v -x automated_test.py
{cloud_files-5.4.1 → cloud_files-5.6.0}/ChangeLog
@@ -1,11 +1,22 @@
 CHANGES
 =======
 
-5.4.1
+5.6.0
 -----
 
-* fix: enable CAVE to accept secret kw and alternative server credentials
-* fix(cave): ensure CAVE tokens are handled properly
+* docs: errata in how to use performance monitoring tools
+* feat: performance monitoring (#117)
+* docs: describe the cloudfiles constructor
+* fix(cli): rm can interpret aliases properly
+* fix: remove streaming for small s3 uploads
+* docs: give more examples for CloudFile
+* perf: aws-chunked was a bug in moto, that's now fixed. Stop adding copy_object
+
+5.5.0
+-----
+
+* perf: use fewer threads if there are fewer things to upload (#116)
+* fix(cave): expanded credentials methods (#114)
 * fix: isdir was inverted for http
 * chore: update changelog
 * fix(cli): incorrect escaping
{cloud_files-5.4.1 → cloud_files-5.6.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cloud-files
-Version: 5.4.1
+Version: 5.6.0
 Summary: Fast access to cloud storage and local FS.
 Home-page: https://github.com/seung-lab/cloud-files/
 Author: William Silversmith
@@ -30,6 +30,7 @@ Requires-Dist: google-auth>=1.10.0
 Requires-Dist: google-cloud-core>=1.1.0
 Requires-Dist: google-cloud-storage>=1.31.1
 Requires-Dist: google-crc32c>=1.0.0
+Requires-Dist: intervaltree
 Requires-Dist: orjson
 Requires-Dist: pathos
 Requires-Dist: protobuf>=3.3.0
@@ -46,6 +47,12 @@ Requires-Dist: pytest; extra == "test"
 Requires-Dist: moto>=5; extra == "test"
 Provides-Extra: numpy
 Requires-Dist: numpy; extra == "numpy"
+Provides-Extra: monitoring
+Requires-Dist: psutil; extra == "monitoring"
+Requires-Dist: intervaltree; extra == "monitoring"
+Requires-Dist: matplotlib; extra == "monitoring"
+Provides-Extra: apache
+Requires-Dist: lxml; extra == "apache"
 
 [![PyPI version](https://badge.fury.io/py/cloud-files.svg)](https://badge.fury.io/py/cloud-files) [![Test Suite](https://github.com/seung-lab/cloud-files/workflows/Test%20Suite/badge.svg)](https://github.com/seung-lab/cloud-files/actions?query=workflow%3A%22Test+Suite%22)
 
@@ -97,8 +104,20 @@ cf.touch([ "example", "example2" ])
 cf = CloudFile("gs://bucket/file1")
 info = cf.head()
 binary = cf.get()
+obj = cf.get_json()
 cf.put(binary)
+cf.put_json()
 cf[:30] # get first 30 bytes of file
+
+num_bytes = cf.size() # get size in bytes (also in head)
+exists = cf.exists() # true or false
+cf.delete() # deletes the file
+cf.touch() # create the file if it doesn't exist
+cf.move("gs://example/destination/directory") # copy then delete source
+cf.transfer_from("gs://example/source/file.txt") # copies file efficiently
+cf.transfer_to("gs://example/dest/file.txt") # copies file efficiently
+
+path = cf.join([ path1, path2, path3 ]) # use the appropriate path separator
 ```
 
 CloudFiles was developed to access files from object storage without ever touching disk. The goal was to reliably and rapidly access a petabyte of image data broken down into tens to hundreds of millions of files being accessed in parallel across thousands of cores. CloudFiles has been used to processes dozens of images, many of which were in the hundreds of terabyte range. It has reliably read and written tens of billions of files to date.
@@ -124,6 +143,7 @@ CloudFiles was developed to access files from object storage without ever touchi
 ```bash
 pip install cloud-files
 pip install cloud-files[test] # to enable testing with pytest
+pip install cloud-files[monitoring] # enable plotting network performance
 ```
 
 If you run into trouble installing dependenies, make sure you're using at least Python3.6 and you have updated pip. On Linux, some dependencies require manylinux2010 or manylinux2014 binaries which earlier versions of pip do not search for. MacOS, Linux, and Windows are supported platforms.
@@ -176,7 +196,9 @@ You can create the `google-secret.json` file [here](https://console.cloud.google
 
 ## API Documentation
 
-Note that the "Cloud Costs" mentioned below are current as of June 2020 and are subject to change. As of this writing, S3 and Google use identical cost structures for these operations.
+Note that the "Cloud Costs" mentioned below are current as of June 2020 and are subject to change. As of this writing, S3 and Google use identical cost structures for these operations.
+
+`CloudFile` is a more intuitive version of `CloudFiles` designed for managing single files instead of groups of files. See examples above. There is an analogus method for each `CloudFiles` method (where it makes sense).
 
 ### Constructor
 ```python
@@ -236,6 +258,10 @@ binary = cf['filename'][0:5] # same result, fetches 11 bytes
 >> b'hello' # represents byte range 0-4 inclusive of filename
 
 binaries = cf[:100] # download the first 100 files
+
+# Get the TransmissionMonitor object that records
+# the flight time of each file.
+binaries, tm = cf.get(..., return_recording=True)
 ```
 
 `get` supports several different styles of input. The simplest takes a scalar filename and returns the contents of that file. However, you can also specify lists of filenames, a byte range request, and lists of byte range requests. You can provide a generator or iterator as input as well. Order is not guaranteed.
@@ -265,6 +291,10 @@ cf.puts([{
 cf.puts([ (path, content), (path, content) ], compression='gzip')
 cf.put_jsons(...)
 
+# Get the TransmissionMonitor object that records the
+# flight times of each object.
+_, tm = cf.puts(..., return_recording=True)
+
 # Definition of put, put_json is identical
 def put(
   self,
@@ -469,6 +499,12 @@ cloudfiles -p 2 cp --progress -r s3://bkt/ gs://bkt2/
 cloudfiles cp -c br s3://bkt/file.txt gs://bkt2/
 # decompress
 cloudfiles cp -c none s3://bkt/file.txt gs://bkt2/
+# save chart of file flight times
+cloudfiles cp --flight-time s3://bkt/file.txt gs://bkt2/
+# save a chart of estimated bandwidth usage from these files alone
+cloudfiles cp --io-rate s3://bkt/file.txt gs://bkt2/
+# save a chart of measured bandwidth usage for the machine
+cloudfiles cp --machine-io-rate s3://bkt/file.txt gs://bkt2/
 # move or rename files
 cloudfiles mv s3://bkt/file.txt gs://bkt2/
 # create an empty file if not existing
@@ -528,6 +564,40 @@ cloudfiles alias rm example # remove example://
 
 The alias file is only accessed (and cached) if CloudFiles encounters an unknown protocol. If you stick to default protocols and use the syntax `s3://https://example.com/` for alternative endpoints, you can still use CloudFiles in environments without filesystem access.
 
+## Performance Monitoring
+
+CloudFiles now comes with two tools inside of `cloudfiles.monitoring` for measuring the performance of transfer operations both via the CLI and the programatic interface.
+
+A `TransmissionMonitor` object is created during each download or upload (e.g. `cf.get` or `cf.puts`) call. You can access this object by using the `return_recording=True` flag. This object saves the flight times of each object along with its size in an interval tree datastructure. It comes with methods for estimating the peak bits per a second and can plot both time of flight and the estimated transfer rates (assuming the transfer is evenly divided over the flight of an object, an assumption that is not always true). This allows you to estimate the contribution of a given CloudFiles operation to a machine's network IO.
+
+```python
+from cloudfiles import CloudFiles
+
+...
+
+results, tm = cf.get([ ... some files ... ], return_recording=True)
+
+value = tm.peak_Mbps() # estimated peak transfer rate
+value = tm.total_Mbps() # estimated average transfer rate
+tm.plot_gantt() # time of flight chart
+tm.plot_histogram() # transfer rate chart
+```
+
+A second object, `IOSampler`, can sample the OS network counters using a background thread and provides a global view of the machine's network performance during the life of the transfer. It is enabled on the CLI for the `cp` command when the `--machine-io-rate` flag is enabled, but must be manually started programatically. This is to avoid accidentally starting unnecessary sampling threads. The samples are accumulated into a circular buffer, so make sure to set the buffer length long enough for your points of interest to be captured.
+
+```python
+from cloudfiles.monitoring import IOSampler
+
+sampler = IOSampler(buffer_sec=600, interval=0.25)
+sampler.start_sampling()
+
+...
+
+
+sampler.stop_sampling()
+sampler.plot_histogram()
+```
+
 ## Credits
 
 CloudFiles is derived from the [CloudVolume.Storage](https://github.com/seung-lab/cloud-volume/tree/master/cloudvolume/storage) system.
{cloud_files-5.4.1 → cloud_files-5.6.0}/README.md
@@ -48,8 +48,20 @@ cf.touch([ "example", "example2" ])
 cf = CloudFile("gs://bucket/file1")
 info = cf.head()
 binary = cf.get()
+obj = cf.get_json()
 cf.put(binary)
+cf.put_json()
 cf[:30] # get first 30 bytes of file
+
+num_bytes = cf.size() # get size in bytes (also in head)
+exists = cf.exists() # true or false
+cf.delete() # deletes the file
+cf.touch() # create the file if it doesn't exist
+cf.move("gs://example/destination/directory") # copy then delete source
+cf.transfer_from("gs://example/source/file.txt") # copies file efficiently
+cf.transfer_to("gs://example/dest/file.txt") # copies file efficiently
+
+path = cf.join([ path1, path2, path3 ]) # use the appropriate path separator
 ```
 
 CloudFiles was developed to access files from object storage without ever touching disk. The goal was to reliably and rapidly access a petabyte of image data broken down into tens to hundreds of millions of files being accessed in parallel across thousands of cores. CloudFiles has been used to processes dozens of images, many of which were in the hundreds of terabyte range. It has reliably read and written tens of billions of files to date.
@@ -75,6 +87,7 @@ CloudFiles was developed to access files from object storage without ever touchi
 ```bash
 pip install cloud-files
 pip install cloud-files[test] # to enable testing with pytest
+pip install cloud-files[monitoring] # enable plotting network performance
 ```
 
 If you run into trouble installing dependenies, make sure you're using at least Python3.6 and you have updated pip. On Linux, some dependencies require manylinux2010 or manylinux2014 binaries which earlier versions of pip do not search for. MacOS, Linux, and Windows are supported platforms.
@@ -127,7 +140,9 @@ You can create the `google-secret.json` file [here](https://console.cloud.google
 
 ## API Documentation
 
-Note that the "Cloud Costs" mentioned below are current as of June 2020 and are subject to change. As of this writing, S3 and Google use identical cost structures for these operations.
+Note that the "Cloud Costs" mentioned below are current as of June 2020 and are subject to change. As of this writing, S3 and Google use identical cost structures for these operations.
+
+`CloudFile` is a more intuitive version of `CloudFiles` designed for managing single files instead of groups of files. See examples above. There is an analogus method for each `CloudFiles` method (where it makes sense).
 
 ### Constructor
 ```python
@@ -187,6 +202,10 @@ binary = cf['filename'][0:5] # same result, fetches 11 bytes
 >> b'hello' # represents byte range 0-4 inclusive of filename
 
 binaries = cf[:100] # download the first 100 files
+
+# Get the TransmissionMonitor object that records
+# the flight time of each file.
+binaries, tm = cf.get(..., return_recording=True)
 ```
 
 `get` supports several different styles of input. The simplest takes a scalar filename and returns the contents of that file. However, you can also specify lists of filenames, a byte range request, and lists of byte range requests. You can provide a generator or iterator as input as well. Order is not guaranteed.
@@ -216,6 +235,10 @@ cf.puts([{
 cf.puts([ (path, content), (path, content) ], compression='gzip')
 cf.put_jsons(...)
 
+# Get the TransmissionMonitor object that records the
+# flight times of each object.
+_, tm = cf.puts(..., return_recording=True)
+
 # Definition of put, put_json is identical
 def put(
   self,
@@ -420,6 +443,12 @@ cloudfiles -p 2 cp --progress -r s3://bkt/ gs://bkt2/
 cloudfiles cp -c br s3://bkt/file.txt gs://bkt2/
 # decompress
 cloudfiles cp -c none s3://bkt/file.txt gs://bkt2/
+# save chart of file flight times
+cloudfiles cp --flight-time s3://bkt/file.txt gs://bkt2/
+# save a chart of estimated bandwidth usage from these files alone
+cloudfiles cp --io-rate s3://bkt/file.txt gs://bkt2/
+# save a chart of measured bandwidth usage for the machine
+cloudfiles cp --machine-io-rate s3://bkt/file.txt gs://bkt2/
 # move or rename files
 cloudfiles mv s3://bkt/file.txt gs://bkt2/
 # create an empty file if not existing
@@ -479,6 +508,40 @@ cloudfiles alias rm example # remove example://
 
 The alias file is only accessed (and cached) if CloudFiles encounters an unknown protocol. If you stick to default protocols and use the syntax `s3://https://example.com/` for alternative endpoints, you can still use CloudFiles in environments without filesystem access.
 
+## Performance Monitoring
+
+CloudFiles now comes with two tools inside of `cloudfiles.monitoring` for measuring the performance of transfer operations both via the CLI and the programatic interface.
+
+A `TransmissionMonitor` object is created during each download or upload (e.g. `cf.get` or `cf.puts`) call. You can access this object by using the `return_recording=True` flag. This object saves the flight times of each object along with its size in an interval tree datastructure. It comes with methods for estimating the peak bits per a second and can plot both time of flight and the estimated transfer rates (assuming the transfer is evenly divided over the flight of an object, an assumption that is not always true). This allows you to estimate the contribution of a given CloudFiles operation to a machine's network IO.
+
+```python
+from cloudfiles import CloudFiles
+
+...
+
+results, tm = cf.get([ ... some files ... ], return_recording=True)
+
+value = tm.peak_Mbps() # estimated peak transfer rate
+value = tm.total_Mbps() # estimated average transfer rate
+tm.plot_gantt() # time of flight chart
+tm.plot_histogram() # transfer rate chart
+```
+
+A second object, `IOSampler`, can sample the OS network counters using a background thread and provides a global view of the machine's network performance during the life of the transfer. It is enabled on the CLI for the `cp` command when the `--machine-io-rate` flag is enabled, but must be manually started programatically. This is to avoid accidentally starting unnecessary sampling threads. The samples are accumulated into a circular buffer, so make sure to set the buffer length long enough for your points of interest to be captured.
+
+```python
+from cloudfiles.monitoring import IOSampler
+
+sampler = IOSampler(buffer_sec=600, interval=0.25)
+sampler.start_sampling()
+
+...
+
+
+sampler.stop_sampling()
+sampler.plot_histogram()
+```
+
 ## Credits
 
 CloudFiles is derived from the [CloudVolume.Storage](https://github.com/seung-lab/cloud-volume/tree/master/cloudvolume/storage) system.
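The Performance Monitoring docs added above show `return_recording=True` and `IOSampler` in separate snippets. As a hedged sketch (not taken from the package docs; the bucket path, payload, and buffer settings are illustrative), the two can be combined: `IOSampler` measures the whole machine the way the CLI's `--machine-io-rate` flag does, while the returned `TransmissionMonitor` estimates this operation's share.

```python
from cloudfiles import CloudFiles
from cloudfiles.monitoring import IOSampler

# Sample machine-wide network counters at 4 Hz, keeping up to 10 minutes of history.
sampler = IOSampler(buffer_sec=600, interval=0.25)
sampler.start_sampling()
try:
  cf = CloudFiles("gs://bkt2/")  # hypothetical destination bucket
  _, tm = cf.puts(
    [ ("hello.txt", b"hello world") ],
    return_recording=True,
  )
finally:
  sampler.stop_sampling()  # always stop the background sampling thread

sampler.plot_histogram()  # measured bandwidth for the whole machine
tm.plot_histogram()       # estimated bandwidth from this operation alone
```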
{cloud_files-5.4.1 → cloud_files-5.6.0}/automated_test.py
@@ -4,6 +4,13 @@ import re
 import shutil
 import time
 
+
+import uuid
+from typing import Optional
+from unittest.mock import patch, MagicMock
+import numpy as np
+from cloudfiles.monitoring import TransmissionMonitor, IOSampler, IOEnum
+
 from moto import mock_aws
 
 COMPRESSION_TYPES = [
@@ -1259,3 +1266,117 @@ def test_touch(s3, protocol):
   cf.touch([ str(i) for i in range(20) ])
 
   assert sorted(list(cf)) == sorted([ str(i) for i in range(20) ])
+
+class TestTransmissionMonitor:
+  @pytest.fixture
+  def tx_monitor(self):
+    return TransmissionMonitor(IOEnum.TX)
+
+  @pytest.fixture
+  def rx_monitor(self):
+    return TransmissionMonitor(IOEnum.RX)
+
+  def test_init(self, tx_monitor):
+    assert tx_monitor._direction == IOEnum.TX
+    assert tx_monitor._total_bytes_landed == 0
+    assert len(tx_monitor._in_flight) == 0
+    assert len(tx_monitor._errors) == 0
+    assert tx_monitor._in_flight_bytes == 0
+
+  def test_start_io(self, tx_monitor):
+    flight_id = tx_monitor.start_io(100)
+    assert isinstance(flight_id, uuid.UUID)
+    assert len(tx_monitor._in_flight) == 1
+    assert tx_monitor._in_flight_bytes == 100
+
+  def test_end_io(self, tx_monitor):
+    flight_id = tx_monitor.start_io(100)
+    time.sleep(0.01)
+    tx_monitor.end_io(flight_id, 100)
+
+    assert len(tx_monitor._in_flight) == 0
+    assert tx_monitor._in_flight_bytes == 0
+    assert tx_monitor._total_bytes_landed == 100
+    assert len(tx_monitor._intervaltree) == 1
+
+  def test_end_error(self, tx_monitor):
+    flight_id = tx_monitor.start_io(100)
+    tx_monitor.end_error(flight_id)
+
+    assert flight_id in tx_monitor._errors
+    assert len(tx_monitor._in_flight) == 1 # Still in flight
+
+  def test_total_bytes(self, tx_monitor):
+    flight_id1 = tx_monitor.start_io(100)
+    flight_id2 = tx_monitor.start_io(200)
+    tx_monitor.end_io(flight_id1, 100)
+    tx_monitor.end_io(flight_id2, 200)
+
+    assert tx_monitor.total_bytes() == 300
+
+  def test_total_bps(self, tx_monitor):
+    flight_id = tx_monitor.start_io(100)
+    time.sleep(1.0)
+    tx_monitor.end_io(flight_id, 100)
+
+    bps = tx_monitor.total_bps()
+    assert pytest.approx(bps, rel=0.1) == 800 # 100 bytes * 8 bits/byte / 1 sec
+
+  def test_current_bps(self, tx_monitor):
+    # Test with lookback window
+    flight_id = tx_monitor.start_io(100)
+    time.sleep(1.0)
+    tx_monitor.end_io(flight_id, 100)
+
+    bps = tx_monitor.current_bps(look_back_sec=0.5)
+    assert bps > 0
+
+  def test_peak_bps(self, tx_monitor):
+    flight_id1 = tx_monitor.start_io(100)
+    time.sleep(0.5)
+    tx_monitor.end_io(flight_id1, 100)
+
+    flight_id2 = tx_monitor.start_io(200)
+    time.sleep(0.5)
+    tx_monitor.end_io(flight_id2, 200)
+
+    peak = tx_monitor.peak_bps()
+    assert peak > 0
+
+  def test_histogram(self, tx_monitor):
+    flight_id = tx_monitor.start_io(100)
+    time.sleep(1.0)
+    tx_monitor.end_io(flight_id, 100)
+
+    hist = tx_monitor.histogram(resolution=1.0)
+    assert len(hist) > 0
+    assert np.sum(hist) == 100
+
+  def test_merge(self, tx_monitor):
+    monitor1 = TransmissionMonitor(IOEnum.TX)
+    monitor2 = TransmissionMonitor(IOEnum.TX)
+
+    flight_id1 = monitor1.start_io(100)
+    time.sleep(0.1)
+    monitor1.end_io(flight_id1, 100)
+
+    flight_id2 = monitor2.start_io(200)
+    time.sleep(0.1)
+    monitor2.end_io(flight_id2, 200)
+
+    merged = TransmissionMonitor.merge([monitor1, monitor2])
+
+    assert merged.total_bytes() == 300
+    assert len(merged._intervaltree) == 2
+
+  def test_serialization(self, tx_monitor):
+    flight_id = tx_monitor.start_io(100)
+    time.sleep(0.1)
+    tx_monitor.end_io(flight_id, 100)
+
+    import pickle
+    data = pickle.dumps(tx_monitor)
+    new_monitor = pickle.loads(data)
+
+    assert new_monitor.total_bytes() == 100
+    assert hasattr(new_monitor, '_lock')
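The new tests drive `TransmissionMonitor` directly rather than through `return_recording=True`. A minimal sketch of that lower-level pattern, assuming `start_io`/`end_io`/`end_error` behave as the tests imply (they are internal hooks rather than documented public API, and the payloads here are made up):

```python
import time
from cloudfiles.monitoring import TransmissionMonitor, IOEnum

monitor = TransmissionMonitor(IOEnum.TX)  # TX = upload direction, per the tests

payloads = [ b"a" * 1024, b"b" * 2048 ]  # hypothetical data to send
for data in payloads:
  flight_id = monitor.start_io(len(data))  # register bytes entering flight
  try:
    time.sleep(0.05)                       # stand-in for the real network call
    monitor.end_io(flight_id, len(data))   # record the landing
  except Exception:
    monitor.end_error(flight_id)           # mark the flight as failed
    raise

print(monitor.total_bytes())  # 3072
print(monitor.peak_bps())     # estimated peak bits per second
```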
{cloud_files-5.4.1 → cloud_files-5.6.0}/cloud_files.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cloud-files
-Version: 5.4.1
+Version: 5.6.0
 Summary: Fast access to cloud storage and local FS.
 Home-page: https://github.com/seung-lab/cloud-files/
 Author: William Silversmith
@@ -30,6 +30,7 @@ Requires-Dist: google-auth>=1.10.0
 Requires-Dist: google-cloud-core>=1.1.0
 Requires-Dist: google-cloud-storage>=1.31.1
 Requires-Dist: google-crc32c>=1.0.0
+Requires-Dist: intervaltree
 Requires-Dist: orjson
 Requires-Dist: pathos
 Requires-Dist: protobuf>=3.3.0
@@ -46,6 +47,12 @@ Requires-Dist: pytest; extra == "test"
 Requires-Dist: moto>=5; extra == "test"
 Provides-Extra: numpy
 Requires-Dist: numpy; extra == "numpy"
+Provides-Extra: monitoring
+Requires-Dist: psutil; extra == "monitoring"
+Requires-Dist: intervaltree; extra == "monitoring"
+Requires-Dist: matplotlib; extra == "monitoring"
+Provides-Extra: apache
+Requires-Dist: lxml; extra == "apache"
 
 [![PyPI version](https://badge.fury.io/py/cloud-files.svg)](https://badge.fury.io/py/cloud-files) [![Test Suite](https://github.com/seung-lab/cloud-files/workflows/Test%20Suite/badge.svg)](https://github.com/seung-lab/cloud-files/actions?query=workflow%3A%22Test+Suite%22)
 
@@ -97,8 +104,20 @@ cf.touch([ "example", "example2" ])
 cf = CloudFile("gs://bucket/file1")
 info = cf.head()
 binary = cf.get()
+obj = cf.get_json()
 cf.put(binary)
+cf.put_json()
 cf[:30] # get first 30 bytes of file
+
+num_bytes = cf.size() # get size in bytes (also in head)
+exists = cf.exists() # true or false
+cf.delete() # deletes the file
+cf.touch() # create the file if it doesn't exist
+cf.move("gs://example/destination/directory") # copy then delete source
+cf.transfer_from("gs://example/source/file.txt") # copies file efficiently
+cf.transfer_to("gs://example/dest/file.txt") # copies file efficiently
+
+path = cf.join([ path1, path2, path3 ]) # use the appropriate path separator
 ```
 
 CloudFiles was developed to access files from object storage without ever touching disk. The goal was to reliably and rapidly access a petabyte of image data broken down into tens to hundreds of millions of files being accessed in parallel across thousands of cores. CloudFiles has been used to processes dozens of images, many of which were in the hundreds of terabyte range. It has reliably read and written tens of billions of files to date.
@@ -124,6 +143,7 @@ CloudFiles was developed to access files from object storage without ever touchi
 ```bash
 pip install cloud-files
 pip install cloud-files[test] # to enable testing with pytest
+pip install cloud-files[monitoring] # enable plotting network performance
 ```
 
 If you run into trouble installing dependenies, make sure you're using at least Python3.6 and you have updated pip. On Linux, some dependencies require manylinux2010 or manylinux2014 binaries which earlier versions of pip do not search for. MacOS, Linux, and Windows are supported platforms.
@@ -176,7 +196,9 @@ You can create the `google-secret.json` file [here](https://console.cloud.google
 
 ## API Documentation
 
-Note that the "Cloud Costs" mentioned below are current as of June 2020 and are subject to change. As of this writing, S3 and Google use identical cost structures for these operations.
+Note that the "Cloud Costs" mentioned below are current as of June 2020 and are subject to change. As of this writing, S3 and Google use identical cost structures for these operations.
+
+`CloudFile` is a more intuitive version of `CloudFiles` designed for managing single files instead of groups of files. See examples above. There is an analogus method for each `CloudFiles` method (where it makes sense).
 
 ### Constructor
 ```python
@@ -236,6 +258,10 @@ binary = cf['filename'][0:5] # same result, fetches 11 bytes
 >> b'hello' # represents byte range 0-4 inclusive of filename
 
 binaries = cf[:100] # download the first 100 files
+
+# Get the TransmissionMonitor object that records
+# the flight time of each file.
+binaries, tm = cf.get(..., return_recording=True)
 ```
 
 `get` supports several different styles of input. The simplest takes a scalar filename and returns the contents of that file. However, you can also specify lists of filenames, a byte range request, and lists of byte range requests. You can provide a generator or iterator as input as well. Order is not guaranteed.
@@ -265,6 +291,10 @@ cf.puts([{
 cf.puts([ (path, content), (path, content) ], compression='gzip')
 cf.put_jsons(...)
 
+# Get the TransmissionMonitor object that records the
+# flight times of each object.
+_, tm = cf.puts(..., return_recording=True)
+
 # Definition of put, put_json is identical
 def put(
   self,
@@ -469,6 +499,12 @@ cloudfiles -p 2 cp --progress -r s3://bkt/ gs://bkt2/
 cloudfiles cp -c br s3://bkt/file.txt gs://bkt2/
 # decompress
 cloudfiles cp -c none s3://bkt/file.txt gs://bkt2/
+# save chart of file flight times
+cloudfiles cp --flight-time s3://bkt/file.txt gs://bkt2/
+# save a chart of estimated bandwidth usage from these files alone
+cloudfiles cp --io-rate s3://bkt/file.txt gs://bkt2/
+# save a chart of measured bandwidth usage for the machine
+cloudfiles cp --machine-io-rate s3://bkt/file.txt gs://bkt2/
 # move or rename files
 cloudfiles mv s3://bkt/file.txt gs://bkt2/
 # create an empty file if not existing
@@ -528,6 +564,40 @@ cloudfiles alias rm example # remove example://
 
 The alias file is only accessed (and cached) if CloudFiles encounters an unknown protocol. If you stick to default protocols and use the syntax `s3://https://example.com/` for alternative endpoints, you can still use CloudFiles in environments without filesystem access.
 
+## Performance Monitoring
+
+CloudFiles now comes with two tools inside of `cloudfiles.monitoring` for measuring the performance of transfer operations both via the CLI and the programatic interface.
+
+A `TransmissionMonitor` object is created during each download or upload (e.g. `cf.get` or `cf.puts`) call. You can access this object by using the `return_recording=True` flag. This object saves the flight times of each object along with its size in an interval tree datastructure. It comes with methods for estimating the peak bits per a second and can plot both time of flight and the estimated transfer rates (assuming the transfer is evenly divided over the flight of an object, an assumption that is not always true). This allows you to estimate the contribution of a given CloudFiles operation to a machine's network IO.
+
+```python
+from cloudfiles import CloudFiles
+
+...
+
+results, tm = cf.get([ ... some files ... ], return_recording=True)
+
+value = tm.peak_Mbps() # estimated peak transfer rate
+value = tm.total_Mbps() # estimated average transfer rate
+tm.plot_gantt() # time of flight chart
+tm.plot_histogram() # transfer rate chart
+```
+
+A second object, `IOSampler`, can sample the OS network counters using a background thread and provides a global view of the machine's network performance during the life of the transfer. It is enabled on the CLI for the `cp` command when the `--machine-io-rate` flag is enabled, but must be manually started programatically. This is to avoid accidentally starting unnecessary sampling threads. The samples are accumulated into a circular buffer, so make sure to set the buffer length long enough for your points of interest to be captured.
+
+```python
+from cloudfiles.monitoring import IOSampler
+
+sampler = IOSampler(buffer_sec=600, interval=0.25)
+sampler.start_sampling()
+
+...
+
+
+sampler.stop_sampling()
+sampler.plot_histogram()
+```
+
 ## Credits
 
 CloudFiles is derived from the [CloudVolume.Storage](https://github.com/seung-lab/cloud-volume/tree/master/cloudvolume/storage) system.
{cloud_files-5.4.1 → cloud_files-5.6.0}/cloud_files.egg-info/SOURCES.txt
@@ -24,6 +24,7 @@ cloudfiles/exceptions.py
 cloudfiles/gcs.py
 cloudfiles/interfaces.py
 cloudfiles/lib.py
+cloudfiles/monitoring.py
 cloudfiles/paths.py
 cloudfiles/resumable_tools.py
 cloudfiles/scheduler.py
cloud_files-5.6.0/cloud_files.egg-info/pbr.json
@@ -0,0 +1 @@
+{"git_version": "4a9dd39", "is_release": true}
{cloud_files-5.4.1 → cloud_files-5.6.0}/cloud_files.egg-info/requires.txt
@@ -9,6 +9,7 @@ google-auth>=1.10.0
 google-cloud-core>=1.1.0
 google-cloud-storage>=1.31.1
 google-crc32c>=1.0.0
+intervaltree
 orjson
 pathos
 protobuf>=3.3.0
@@ -21,6 +22,14 @@ zstandard
 rsa>=4.7.2
 fasteners
 
+[apache]
+lxml
+
+[monitoring]
+psutil
+intervaltree
+matplotlib
+
 [numpy]
 numpy
 
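As requires.txt shows, `psutil` and `matplotlib` ship only with the new `[monitoring]` extra (only `intervaltree` was promoted into the core requirements), so a plain `pip install cloud-files` may not be able to import the monitoring module. A guarded import is one defensive pattern; whether the import actually raises `ImportError` without the extra is an assumption on my part, not something the package documents:

```python
# Fall back gracefully when the [monitoring] extra is not installed.
try:
  from cloudfiles.monitoring import TransmissionMonitor, IOSampler
  HAVE_MONITORING = True
except ImportError:
  TransmissionMonitor = IOSampler = None
  HAVE_MONITORING = False
```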