rabbit-slide-kou-scipy-japan-2020 2020.10.30.0
- checksums.yaml +7 -0
- data/.rabbit +1 -0
- data/README.rd +65 -0
- data/Rakefile +18 -0
- data/Rplots.pdf +0 -0
- data/apache-arrow-1.0.rab +921 -0
- data/config.yaml +23 -0
- data/images/amazon-athena-improvement.svg +54 -0
- data/images/apache-arrow-and-amazon-athena.svg +1584 -0
- data/images/apache-arrow-and-apache-parquet.svg +1196 -0
- data/images/apache-arrow-and-apache-spark.svg +1320 -0
- data/images/apache-arrow-and-data-interchange.svg +844 -0
- data/images/apache-spark-improvement.svg +58 -0
- data/images/arrow.svg +25 -0
- data/images/benchmark-data-interchange-apache-arrow-apache-parquet.svg +1214 -0
- data/images/benchmark-data-interchange-apache-arrow-csv.svg +1097 -0
- data/images/benchmark-data-interchange-apache-arrow-numpy.svg +1195 -0
- data/images/contributor.png +0 -0
- data/images/memory-mapping.svg +236 -0
- data/images/noun_File_3524817.svg +1 -0
- data/images/noun_Memory_2294239.svg +1 -0
- data/images/parquet.svg +18 -0
- data/images/record-batch.svg +361 -0
- data/pdf/scipy-japan-2020-apache-arrow-1.0.pdf +0 -0
- data/theme.rb +2 -0
- data/tools/amazon-athena-improvement.R +12 -0
- data/tools/apache-spark-improvement.R +11 -0
- data/tools/benchmark-data-interchange.py +125 -0
- metadata +104 -0
data/tools/amazon-athena-improvement.R
ADDED
@@ -0,0 +1,12 @@
+data_frame = data.frame(method=c("Athena + CSV on S3 Query",
+                                 "Federated S3 Query w/Apache Arrow"),
+                        throughput=c(0.12, 1.5))
+ggplot2::ggplot(data_frame) +
+  ggplot2::ggtitle("Apache Arrow improves data interchange performance") +
+  ggplot2::labs(x="Throughput in billion rows/sec (Longer is faster)",
+                y="Method",
+                caption="Data at https://github.com/awslabs/aws-athena-query-federation/tree/master/athena-federation-sdk#performance") +
+  ggplot2::geom_bar(ggplot2::aes(y=method, weight=throughput)) +
+  ggplot2::ggsave("images/amazon-athena-improvement.svg",
+                  dpi=100,
+                  height=3.8)
data/tools/apache-spark-improvement.R
ADDED
@@ -0,0 +1,11 @@
+data_frame = data.frame(case=c("pickle", "Apache Arrow"),
+                        elapsed=c(20.7, 0.737))
+ggplot2::ggplot(data_frame) +
+  ggplot2::ggtitle("Apache Arrow improves data interchange performance") +
+  ggplot2::labs(x="Elapsed time in seconds (Shorter is faster)",
+                y="Format",
+                caption="Data at https://arrow.apache.org/blog/2017/07/26/spark-arrow/") +
+  ggplot2::geom_bar(ggplot2::aes(y=case, weight=elapsed)) +
+  ggplot2::ggsave("images/apache-spark-improvement.svg",
+                  dpi=100,
+                  height=3.8)
data/tools/benchmark-data-interchange.py
ADDED
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+
+import datetime
+import io
+import numpy as np
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+import matplotlib.pyplot as plt
+
+n_columns = 10
+n_records = 1_000_000
+# n_records = 1000
+data = np.random.randint(0, 1000, [n_records, n_columns])
+
+csv_output = io.StringIO()
+before = datetime.datetime.now()
+np.savetxt(csv_output, data, '%d', ',')
+csv_generate_elapsed_time = datetime.datetime.now() - before
+csv = csv_output.getvalue()
+print(f'Generate CSV: {csv_generate_elapsed_time.total_seconds()}')
+
+csv_input = io.StringIO(csv)
+before = datetime.datetime.now()
+np.loadtxt(csv_input, int, delimiter=',')
+csv_load_elapsed_time = datetime.datetime.now() - before
+print(f'Load CSV: {csv_load_elapsed_time.total_seconds()}')
+
+
+npy_output = io.BytesIO()
+before = datetime.datetime.now()
+np.save(npy_output, data)
+npy_generate_elapsed_time = datetime.datetime.now() - before
+npy = npy_output.getvalue()
+print(f'Generate NumPy: {npy_generate_elapsed_time.total_seconds()}')
+
+npy_input = io.BytesIO(npy)
+before = datetime.datetime.now()
+np.load(npy_input)
+npy_load_elapsed_time = datetime.datetime.now() - before
+print(f'Load NumPy: {npy_load_elapsed_time.total_seconds()}')
+
+
+before = datetime.datetime.now()
+df = pd.DataFrame(data)
+table = pa.Table.from_pandas(df)
+parquet_output = pa.BufferOutputStream()
+writer = pq.ParquetWriter(parquet_output, table.schema)
+writer.write_table(table)
+writer.close()
+parquet = parquet_output.getvalue()
+parquet_generate_elapsed_time = datetime.datetime.now() - before
+print(f'Generate Apache Parquet: {parquet_generate_elapsed_time.total_seconds()}')
+
+before = datetime.datetime.now()
+parquet_input = pa.BufferReader(parquet)
+reader = pq.ParquetFile(parquet_input)
+table = reader.read()
+df = table.to_pandas()
+df.to_numpy()
+parquet_load_elapsed_time = datetime.datetime.now() - before
+print(f'Load Apache Parquet: {parquet_load_elapsed_time.total_seconds()}')
+
+
+before = datetime.datetime.now()
+df = pd.DataFrame(data)
+table = pa.Table.from_pandas(df)
+arrow_output = pa.BufferOutputStream()
+writer = pa.ipc.new_file(arrow_output, table.schema)
+writer.write_table(table)
+writer.close()
+arrow = arrow_output.getvalue()
+arrow_generate_elapsed_time = datetime.datetime.now() - before
+print(f'Generate Apache Arrow: {arrow_generate_elapsed_time.total_seconds()}')
+
+before = datetime.datetime.now()
+arrow_input = pa.BufferReader(arrow)
+reader = pa.ipc.open_file(arrow_input)
+df = reader.read_pandas()
+df.to_numpy()
+arrow_load_elapsed_time = datetime.datetime.now() - before
+print(f'Load Apache Arrow: {arrow_load_elapsed_time.total_seconds()}')
+
+
+labels = ['Generate', 'Load']
+csv_elapsed_times = [
+    csv_generate_elapsed_time.total_seconds(),
+    csv_load_elapsed_time.total_seconds(),
+]
+npy_elapsed_times = [
+    npy_generate_elapsed_time.total_seconds(),
+    npy_load_elapsed_time.total_seconds(),
+]
+parquet_elapsed_times = [
+    parquet_generate_elapsed_time.total_seconds(),
+    parquet_load_elapsed_time.total_seconds(),
+]
+arrow_elapsed_times = [
+    arrow_generate_elapsed_time.total_seconds(),
+    arrow_load_elapsed_time.total_seconds(),
+]
+
+y = np.arange(len(labels))
+width = 0.35
+
+competitors = [
+    ['csv', 'CSV', csv_elapsed_times],
+    ['numpy', 'NumPy', npy_elapsed_times],
+    ['apache-parquet', 'Apache Parquet', parquet_elapsed_times],
+]
+for id, label, elapsed_times in competitors:
+    fig, ax = plt.subplots()
+    ax.barh(y - width / 2, elapsed_times, width, label=label)
+    ax.barh(y + width / 2, arrow_elapsed_times, width, label='Apache Arrow')
+
+    ax.set_xlabel(f'Elapsed time (second) ({n_records} records) (Shorter is faster)')
+    ax.set_title('Apache Arrow improves data interchange performance')
+    ax.set_yticks(y)
+    ax.set_yticklabels(labels)
+    ax.invert_yaxis()
+    ax.legend()
+
+    fig.tight_layout()
+
+    fig.savefig(f'images/benchmark-data-interchange-apache-arrow-{id}.svg')
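The benchmark above reads the Arrow IPC file from an in-memory buffer. The same IPC file format can also be loaded from disk without parsing or copying, because the on-disk layout matches the in-memory layout, so a reader can simply memory-map the file. A minimal sketch of that pattern, not part of the gem's tooling; the path /tmp/data.arrow and the array sizes are illustrative only:

  #!/usr/bin/env python3
  # Sketch: write a small table as an Arrow IPC file, then read it back
  # through a memory map. open_file()/read_all() reference the mapped
  # pages rather than copying the data into fresh buffers.
  import numpy as np
  import pandas as pd
  import pyarrow as pa

  table = pa.Table.from_pandas(
      pd.DataFrame(np.random.randint(0, 1000, [1000, 10])))

  with pa.OSFile('/tmp/data.arrow', 'wb') as sink:
      with pa.ipc.new_file(sink, table.schema) as writer:
          writer.write_table(table)

  with pa.memory_map('/tmp/data.arrow') as source:
      loaded = pa.ipc.open_file(source).read_all()
  print(loaded.num_rows)

This is the pattern the slide's memory-mapping figure illustrates: loading is little more than an mmap call, which is one reason the Arrow load times measured above stay low.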
metadata
ADDED
@@ -0,0 +1,104 @@
+--- !ruby/object:Gem::Specification
+name: rabbit-slide-kou-scipy-japan-2020
+version: !ruby/object:Gem::Version
+  version: 2020.10.30.0
+platform: ruby
+authors:
+- Sutou Kouhei
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2020-09-24 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rabbit
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 2.0.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 2.0.2
+- !ruby/object:Gem::Dependency
+  name: rabbit-theme-clear-code
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: |-
+  Apache Arrow is a cross-language development platform for in-memory
+  data. You can use Apache Arrow to process large data effectively in
+  Python and other languages such as R. Apache Arrow is the future of
+  data processing. Apache Arrow 1.0, the first major version, was
+  released on 2020-07-24. It's a good time to know Apache Arrow and
+  start using it.
+email:
+- kou@clear-code.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".rabbit"
+- README.rd
+- Rakefile
+- Rplots.pdf
+- apache-arrow-1.0.rab
+- config.yaml
+- images/amazon-athena-improvement.svg
+- images/apache-arrow-and-amazon-athena.svg
+- images/apache-arrow-and-apache-parquet.svg
+- images/apache-arrow-and-apache-spark.svg
+- images/apache-arrow-and-data-interchange.svg
+- images/apache-spark-improvement.svg
+- images/arrow.svg
+- images/benchmark-data-interchange-apache-arrow-apache-parquet.svg
+- images/benchmark-data-interchange-apache-arrow-csv.svg
+- images/benchmark-data-interchange-apache-arrow-numpy.svg
+- images/contributor.png
+- images/memory-mapping.svg
+- images/noun_File_3524817.svg
+- images/noun_Memory_2294239.svg
+- images/parquet.svg
+- images/record-batch.svg
+- pdf/scipy-japan-2020-apache-arrow-1.0.pdf
+- theme.rb
+- tools/amazon-athena-improvement.R
+- tools/apache-spark-improvement.R
+- tools/benchmark-data-interchange.py
+homepage: https://slide.rabbit-shocker.org/authors/kou/scipy-japan-2020/
+licenses:
+- CC-BY-SA-4.0
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubygems_version: 3.2.0.rc.1
+signing_key:
+specification_version: 4
+summary: Apache Arrow 1.0 - A cross-language development platform for in-memory data
+test_files: []