rabbit-slide-kou-scipy-japan-2020 2020.10.30.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2 @@
1
+ @clear_code_font_family ||= find_font_family("モトヤLマルベリ3等幅")
2
+ include_theme("clear-code")
@@ -0,0 +1,12 @@
# Horizontal bar chart comparing Amazon Athena query throughput with and
# without the Apache Arrow based federated query SDK.
# Output: images/amazon-athena-improvement.svg
#
# Note: the original chained ggsave() onto the plot with `+`, but ggsave()
# is a function, not a ggplot layer; it depends on last_plot(), which is
# not reliably set in a non-interactive Rscript run. Build the plot object
# first and pass it to ggsave() explicitly, as the ggplot2 docs describe.
data_frame <- data.frame(method=c("Athena + CSV on S3 Query",
                                  "Federated S3 Query w/Apache Arrow"),
                         throughput=c(0.12, 1.5))
plot <- ggplot2::ggplot(data_frame) +
  ggplot2::ggtitle("Apache Arrow improves data interchange performance") +
  ggplot2::labs(x="Throughput in billion rows/sec (Longer is faster)",
                y="Method",
                caption="Data at https://github.com/awslabs/aws-athena-query-federation/tree/master/athena-federation-sdk#performance") +
  ggplot2::geom_bar(ggplot2::aes(y=method, weight=throughput))
ggplot2::ggsave("images/amazon-athena-improvement.svg",
                plot=plot,
                dpi=100,
                height=3.8)
@@ -0,0 +1,11 @@
# Horizontal bar chart comparing Apache Spark to pandas conversion time
# using pickle vs. Apache Arrow (data from the linked Arrow blog post).
# Output: images/apache-spark-improvement.svg
#
# Note: the original chained ggsave() onto the plot with `+`, but ggsave()
# is a function, not a ggplot layer; it depends on last_plot(), which is
# not reliably set in a non-interactive Rscript run. Build the plot object
# first and pass it to ggsave() explicitly, as the ggplot2 docs describe.
data_frame <- data.frame(case=c("pickle", "Apache Arrow"),
                         elapsed=c(20.7, 0.737))
plot <- ggplot2::ggplot(data_frame) +
  ggplot2::ggtitle("Apache Arrow improves data interchange performance") +
  ggplot2::labs(x="Elapsed time in seconds (Shorter is faster)",
                y="Format",
                caption="Data at https://arrow.apache.org/blog/2017/07/26/spark-arrow/") +
  ggplot2::geom_bar(ggplot2::aes(y=case, weight=elapsed))
ggplot2::ggsave("images/apache-spark-improvement.svg",
                plot=plot,
                dpi=100,
                height=3.8)
@@ -0,0 +1,125 @@
#!/usr/bin/env python3
#
# Benchmark data interchange performance for the slide charts: serialize a
# random integer table to an in-memory buffer ("Generate") and parse it back
# ("Load") for CSV, NumPy .npy, Apache Parquet and Apache Arrow, then render
# one horizontal bar chart per competitor vs. Apache Arrow under images/.
#
# Fixes over the original: `competities` typo -> `competitors`, loop
# variable `id` no longer shadows the builtin, and each figure is closed
# after saving so matplotlib does not accumulate open figures.

import datetime
import io

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt

n_columns = 10
n_records = 1_000_000
# n_records = 1000  # Uncomment for a quick smoke run.
data = np.random.randint(0, 1000, [n_records, n_columns])

# --- CSV: text round-trip via NumPy ---
csv_output = io.StringIO()
before = datetime.datetime.now()
np.savetxt(csv_output, data, '%d', ',')
csv_generate_elapsed_time = datetime.datetime.now() - before
csv = csv_output.getvalue()
print(f'Generate CSV: {csv_generate_elapsed_time.total_seconds()}')

csv_input = io.StringIO(csv)
before = datetime.datetime.now()
np.loadtxt(csv_input, int, delimiter=',')
csv_load_elapsed_time = datetime.datetime.now() - before
print(f'Load CSV: {csv_load_elapsed_time.total_seconds()}')


# --- NumPy: binary .npy round-trip ---
npy_output = io.BytesIO()
before = datetime.datetime.now()
np.save(npy_output, data)
npy_generate_elapsed_time = datetime.datetime.now() - before
npy = npy_output.getvalue()
print(f'Generate NumPy: {npy_generate_elapsed_time.total_seconds()}')

npy_input = io.BytesIO(npy)
before = datetime.datetime.now()
np.load(npy_input)
npy_load_elapsed_time = datetime.datetime.now() - before
print(f'Load NumPy: {npy_load_elapsed_time.total_seconds()}')


# --- Apache Parquet round-trip (timed including the pandas conversion,
# --- to match the Arrow case below) ---
before = datetime.datetime.now()
df = pd.DataFrame(data)
table = pa.Table.from_pandas(df)
parquet_output = pa.BufferOutputStream()
writer = pq.ParquetWriter(parquet_output, table.schema)
writer.write_table(table)
writer.close()
parquet = parquet_output.getvalue()
parquet_generate_elapsed_time = datetime.datetime.now() - before
print(f'Generate Apache Parquet: {parquet_generate_elapsed_time.total_seconds()}')

before = datetime.datetime.now()
parquet_input = pa.BufferReader(parquet)
reader = pq.ParquetFile(parquet_input)
table = reader.read()
df = table.to_pandas()
df.to_numpy()
parquet_load_elapsed_time = datetime.datetime.now() - before
print(f'Load Apache Parquet: {parquet_load_elapsed_time.total_seconds()}')


# --- Apache Arrow IPC file format round-trip ---
before = datetime.datetime.now()
df = pd.DataFrame(data)
table = pa.Table.from_pandas(df)
arrow_output = pa.BufferOutputStream()
writer = pa.ipc.new_file(arrow_output, table.schema)
writer.write_table(table)
writer.close()
arrow = arrow_output.getvalue()
arrow_generate_elapsed_time = datetime.datetime.now() - before
print(f'Generate Apache Arrow: {arrow_generate_elapsed_time.total_seconds()}')

before = datetime.datetime.now()
arrow_input = pa.BufferReader(arrow)
reader = pa.ipc.open_file(arrow_input)
df = reader.read_pandas()
df.to_numpy()
arrow_load_elapsed_time = datetime.datetime.now() - before
print(f'Load Apache Arrow: {arrow_load_elapsed_time.total_seconds()}')


# --- Charting: one figure per competitor, always paired with Arrow ---
labels = ['Generate', 'Load']
csv_elapsed_times = [
    csv_generate_elapsed_time.total_seconds(),
    csv_load_elapsed_time.total_seconds(),
]
npy_elapsed_times = [
    npy_generate_elapsed_time.total_seconds(),
    npy_load_elapsed_time.total_seconds(),
]
parquet_elapsed_times = [
    parquet_generate_elapsed_time.total_seconds(),
    parquet_load_elapsed_time.total_seconds(),
]
arrow_elapsed_times = [
    arrow_generate_elapsed_time.total_seconds(),
    arrow_load_elapsed_time.total_seconds(),
]

y = np.arange(len(labels))
width = 0.35

# [file name suffix, legend label, elapsed times] per competitor.
competitors = [
    ['csv', 'CSV', csv_elapsed_times],
    ['numpy', 'NumPy', npy_elapsed_times],
    ['apache-parquet', 'Apache Parquet', parquet_elapsed_times],
]
for image_id, label, elapsed_times in competitors:
    fig, ax = plt.subplots()
    # Paired horizontal bars: competitor above, Arrow below, per label.
    ax.barh(y - width / 2, elapsed_times, width, label=label)
    ax.barh(y + width / 2, arrow_elapsed_times, width, label='Apache Arrow')

    ax.set_xlabel(f'Elapsed time (second) ({n_records} records) (Shorter is faster)')
    ax.set_title('Apache Arrow improves data interchange performance')
    ax.set_yticks(y)
    ax.set_yticklabels(labels)
    ax.invert_yaxis()  # Show "Generate" on top.
    ax.legend()

    fig.tight_layout()

    fig.savefig(f'images/benchmark-data-interchange-apache-arrow-{image_id}.svg')
    plt.close(fig)  # Done with this figure; free it before the next one.
metadata ADDED
@@ -0,0 +1,104 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rabbit-slide-kou-scipy-japan-2020
3
+ version: !ruby/object:Gem::Version
4
+ version: 2020.10.30.0
5
+ platform: ruby
6
+ authors:
7
+ - Sutou Kouhei
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-09-24 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rabbit
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 2.0.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 2.0.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: rabbit-theme-clear-code
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: |-
42
+ Apache Arrow is a cross-language development platform for in-memory
43
+ data. You can use Apache Arrow to process large data effectively in
44
+ Python and other languages such as R. Apache Arrow is the future of
45
+ data processing. Apache Arrow 1.0, the first major version, was
46
+ released at 2020-07-24. It's a good time to know Apache Arrow and
47
+ start using it.
48
+ email:
49
+ - kou@clear-code.com
50
+ executables: []
51
+ extensions: []
52
+ extra_rdoc_files: []
53
+ files:
54
+ - ".rabbit"
55
+ - README.rd
56
+ - Rakefile
57
+ - Rplots.pdf
58
+ - apache-arrow-1.0.rab
59
+ - config.yaml
60
+ - images/amazon-athena-improvement.svg
61
+ - images/apache-arrow-and-amazon-athena.svg
62
+ - images/apache-arrow-and-apache-parquet.svg
63
+ - images/apache-arrow-and-apache-spark.svg
64
+ - images/apache-arrow-and-data-interchange.svg
65
+ - images/apache-spark-improvement.svg
66
+ - images/arrow.svg
67
+ - images/benchmark-data-interchange-apache-arrow-apache-parquet.svg
68
+ - images/benchmark-data-interchange-apache-arrow-csv.svg
69
+ - images/benchmark-data-interchange-apache-arrow-numpy.svg
70
+ - images/contributor.png
71
+ - images/memory-mapping.svg
72
+ - images/noun_File_3524817.svg
73
+ - images/noun_Memory_2294239.svg
74
+ - images/parquet.svg
75
+ - images/record-batch.svg
76
+ - pdf/scipy-japan-2020-apache-arrow-1.0.pdf
77
+ - theme.rb
78
+ - tools/amazon-athena-improvement.R
79
+ - tools/apache-spark-improvement.R
80
+ - tools/benchmark-data-interchange.py
81
+ homepage: https://slide.rabbit-shocker.org/authors/kou/scipy-japan-2020/
82
+ licenses:
83
+ - CC-BY-SA-4.0
84
+ metadata: {}
85
+ post_install_message:
86
+ rdoc_options: []
87
+ require_paths:
88
+ - lib
89
+ required_ruby_version: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ required_rubygems_version: !ruby/object:Gem::Requirement
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: '0'
99
+ requirements: []
100
+ rubygems_version: 3.2.0.rc.1
101
+ signing_key:
102
+ specification_version: 4
103
+ summary: Apache Arrow 1.0 - A cross-language development platform for in-memory data
104
+ test_files: []