cassback 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +14 -0
- data/.rubocop.yml_disabled +37 -0
- data/Gemfile +8 -0
- data/LICENSE +194 -0
- data/README.md +103 -0
- data/Rakefile.rb +8 -0
- data/bin/cassback +33 -5
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/cassback.gemspec +30 -0
- data/conf/local.yml +18 -0
- data/conf/preprod.yml +15 -0
- data/conf/prod.yml +15 -0
- data/lib/backuptool.rb +53 -2
- data/lib/cassandra.rb +15 -0
- data/lib/cassback/version.rb +3 -0
- data/lib/hadoop.rb +4 -1
- data/scripts/deploy.sh +3 -0
- data/scripts/manualbackups/ansible.cfg +12 -0
- data/scripts/manualbackups/inventory.txt +18 -0
- data/scripts/manualbackups/play_book.sh +13 -0
- data/scripts/manualbackups/playbooks/backups.yml +6 -0
- data/scripts/manualbackups/roles/planb/files/backup.sh +27 -0
- data/scripts/manualbackups/roles/planb/files/httpfs.sh +27 -0
- data/scripts/manualbackups/roles/planb/files/krb5.conf +26 -0
- data/scripts/manualbackups/roles/planb/tasks/main.yml +34 -0
- data/scripts/pre-push +17 -0
- data/test/cassandra_stub.rb +33 -0
- data/test/hadoop_stub.rb +51 -0
- data/test/test_backuptool.rb +180 -0
- metadata +33 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f2b9b6aed95f39752afe6c7df4d2e404d4041450
|
4
|
+
data.tar.gz: 07e9fe1a67dd830ce2f45fc56b37098dbdcb01e9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 08080fa50589f745652230d2c5879406ac0fcf1f4cee3306c89ff963c18c1208fa430dca5cef1e7b22a4f06dcb80746a4d1619c6cc622dc7cd5763bcea082eed
|
7
|
+
data.tar.gz: b8196fe75585a33d1224fe6cd919a14b9fb90bef4dcc7ead9097f65ad9ffc6afa1b4a60b7a56408bf037f141ad5837f465c535fc0e0e173be162e3dc4a8230a9
|
data/.gitignore
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# This configuration was made for rubocop >= 0.36.0
|
2
|
+
|
3
|
+
### SRE Core configuration
|
4
|
+
### (See also https://confluence.criteois.com/pages/viewpage.action?pageId=270467645)
|
5
|
+
# Taken from Core's rules
|
6
|
+
Metrics/LineLength:
|
7
|
+
Max: 120
|
8
|
+
# Taken from Core's rules
|
9
|
+
Style/AlignHash:
|
10
|
+
EnforcedColonStyle: table
|
11
|
+
EnforcedHashRocketStyle: table
|
12
|
+
|
13
|
+
### SRE Storage configuration
|
14
|
+
# We have french people's names lying around
|
15
|
+
Style/AsciiComments:
|
16
|
+
Enabled: false
|
17
|
+
# This wants snake_case file names and we have dashes everywhere
|
18
|
+
Style/FileName:
|
19
|
+
Enabled: false
|
20
|
+
# Use consistent style for hashes (do not indent far away when in parentheses, etc.)
|
21
|
+
Style/IndentHash:
|
22
|
+
EnforcedStyle: consistent
|
23
|
+
# Enforce trailing commas in literals for consistency, ease of edition, and code generation
|
24
|
+
Style/TrailingCommaInLiteral:
|
25
|
+
EnforcedStyleForMultiline: comma
|
26
|
+
|
27
|
+
## Temporary edits (that should be fixed before enabling them)
|
28
|
+
# Messes things up for now
|
29
|
+
Style/BracesAroundHashParameters:
|
30
|
+
Enabled: false
|
31
|
+
# Badly implemented, and crashes in some cases
|
32
|
+
Performance/Casecmp:
|
33
|
+
Enabled: false
|
34
|
+
# We should have trailing commas only inside multiline statements
|
35
|
+
# r.veznaver said this one will be fixed in rubocop
|
36
|
+
Style/TrailingCommaInArguments:
|
37
|
+
Enabled: false
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,194 @@
|
|
1
|
+
|
2
|
+
Apache License
|
3
|
+
Version 2.0, January 2004
|
4
|
+
http://www.apache.org/licenses/
|
5
|
+
|
6
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
7
|
+
|
8
|
+
1. Definitions.
|
9
|
+
|
10
|
+
"License" shall mean the terms and conditions for use, reproduction, and
|
11
|
+
distribution as defined by Sections 1 through 9 of this document.
|
12
|
+
|
13
|
+
"Licensor" shall mean the copyright owner or entity authorized by the copyright
|
14
|
+
owner that is granting the License.
|
15
|
+
|
16
|
+
"Legal Entity" shall mean the union of the acting entity and all other entities
|
17
|
+
that control, are controlled by, or are under common control with that entity.
|
18
|
+
For the purposes of this definition, "control" means (i) the power, direct or
|
19
|
+
indirect, to cause the direction or management of such entity, whether by
|
20
|
+
contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21
|
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
22
|
+
|
23
|
+
"You" (or "Your") shall mean an individual or Legal Entity exercising
|
24
|
+
permissions granted by this License.
|
25
|
+
|
26
|
+
"Source" form shall mean the preferred form for making modifications, including
|
27
|
+
but not limited to software source code, documentation source, and configuration
|
28
|
+
files.
|
29
|
+
|
30
|
+
"Object" form shall mean any form resulting from mechanical transformation or
|
31
|
+
translation of a Source form, including but not limited to compiled object code,
|
32
|
+
generated documentation, and conversions to other media types.
|
33
|
+
|
34
|
+
"Work" shall mean the work of authorship, whether in Source or Object form, made
|
35
|
+
available under the License, as indicated by a copyright notice that is included
|
36
|
+
in or attached to the work (an example is provided in the Appendix below).
|
37
|
+
|
38
|
+
"Derivative Works" shall mean any work, whether in Source or Object form, that
|
39
|
+
is based on (or derived from) the Work and for which the editorial revisions,
|
40
|
+
annotations, elaborations, or other modifications represent, as a whole, an
|
41
|
+
original work of authorship. For the purposes of this License, Derivative Works
|
42
|
+
shall not include works that remain separable from, or merely link (or bind by
|
43
|
+
name) to the interfaces of, the Work and Derivative Works thereof.
|
44
|
+
|
45
|
+
"Contribution" shall mean any work of authorship, including the original version
|
46
|
+
of the Work and any modifications or additions to that Work or Derivative Works
|
47
|
+
thereof, that is intentionally submitted to Licensor for inclusion in the Work
|
48
|
+
by the copyright owner or by an individual or Legal Entity authorized to submit
|
49
|
+
on behalf of the copyright owner. For the purposes of this definition,
|
50
|
+
"submitted" means any form of electronic, verbal, or written communication sent
|
51
|
+
to the Licensor or its representatives, including but not limited to
|
52
|
+
communication on electronic mailing lists, source code control systems, and
|
53
|
+
issue tracking systems that are managed by, or on behalf of, the Licensor for
|
54
|
+
the purpose of discussing and improving the Work, but excluding communication
|
55
|
+
that is conspicuously marked or otherwise designated in writing by the copyright
|
56
|
+
owner as "Not a Contribution."
|
57
|
+
|
58
|
+
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf
|
59
|
+
of whom a Contribution has been received by Licensor and subsequently
|
60
|
+
incorporated within the Work.
|
61
|
+
|
62
|
+
2. Grant of Copyright License.
|
63
|
+
|
64
|
+
Subject to the terms and conditions of this License, each Contributor hereby
|
65
|
+
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
|
66
|
+
irrevocable copyright license to reproduce, prepare Derivative Works of,
|
67
|
+
publicly display, publicly perform, sublicense, and distribute the Work and such
|
68
|
+
Derivative Works in Source or Object form.
|
69
|
+
|
70
|
+
3. Grant of Patent License.
|
71
|
+
|
72
|
+
Subject to the terms and conditions of this License, each Contributor hereby
|
73
|
+
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
|
74
|
+
irrevocable (except as stated in this section) patent license to make, have
|
75
|
+
made, use, offer to sell, sell, import, and otherwise transfer the Work, where
|
76
|
+
such license applies only to those patent claims licensable by such Contributor
|
77
|
+
that are necessarily infringed by their Contribution(s) alone or by combination
|
78
|
+
of their Contribution(s) with the Work to which such Contribution(s) was
|
79
|
+
submitted. If You institute patent litigation against any entity (including a
|
80
|
+
cross-claim or counterclaim in a lawsuit) alleging that the Work or a
|
81
|
+
Contribution incorporated within the Work constitutes direct or contributory
|
82
|
+
patent infringement, then any patent licenses granted to You under this License
|
83
|
+
for that Work shall terminate as of the date such litigation is filed.
|
84
|
+
|
85
|
+
4. Redistribution.
|
86
|
+
|
87
|
+
You may reproduce and distribute copies of the Work or Derivative Works thereof
|
88
|
+
in any medium, with or without modifications, and in Source or Object form,
|
89
|
+
provided that You meet the following conditions:
|
90
|
+
|
91
|
+
You must give any other recipients of the Work or Derivative Works a copy of
|
92
|
+
this License; and
|
93
|
+
You must cause any modified files to carry prominent notices stating that You
|
94
|
+
changed the files; and
|
95
|
+
You must retain, in the Source form of any Derivative Works that You distribute,
|
96
|
+
all copyright, patent, trademark, and attribution notices from the Source form
|
97
|
+
of the Work, excluding those notices that do not pertain to any part of the
|
98
|
+
Derivative Works; and
|
99
|
+
If the Work includes a "NOTICE" text file as part of its distribution, then any
|
100
|
+
Derivative Works that You distribute must include a readable copy of the
|
101
|
+
attribution notices contained within such NOTICE file, excluding those notices
|
102
|
+
that do not pertain to any part of the Derivative Works, in at least one of the
|
103
|
+
following places: within a NOTICE text file distributed as part of the
|
104
|
+
Derivative Works; within the Source form or documentation, if provided along
|
105
|
+
with the Derivative Works; or, within a display generated by the Derivative
|
106
|
+
Works, if and wherever such third-party notices normally appear. The contents of
|
107
|
+
the NOTICE file are for informational purposes only and do not modify the
|
108
|
+
License. You may add Your own attribution notices within Derivative Works that
|
109
|
+
You distribute, alongside or as an addendum to the NOTICE text from the Work,
|
110
|
+
provided that such additional attribution notices cannot be construed as
|
111
|
+
modifying the License.
|
112
|
+
You may add Your own copyright statement to Your modifications and may provide
|
113
|
+
additional or different license terms and conditions for use, reproduction, or
|
114
|
+
distribution of Your modifications, or for any such Derivative Works as a whole,
|
115
|
+
provided Your use, reproduction, and distribution of the Work otherwise complies
|
116
|
+
with the conditions stated in this License.
|
117
|
+
|
118
|
+
5. Submission of Contributions.
|
119
|
+
|
120
|
+
Unless You explicitly state otherwise, any Contribution intentionally submitted
|
121
|
+
for inclusion in the Work by You to the Licensor shall be under the terms and
|
122
|
+
conditions of this License, without any additional terms or conditions.
|
123
|
+
Notwithstanding the above, nothing herein shall supersede or modify the terms of
|
124
|
+
any separate license agreement you may have executed with Licensor regarding
|
125
|
+
such Contributions.
|
126
|
+
|
127
|
+
6. Trademarks.
|
128
|
+
|
129
|
+
This License does not grant permission to use the trade names, trademarks,
|
130
|
+
service marks, or product names of the Licensor, except as required for
|
131
|
+
reasonable and customary use in describing the origin of the Work and
|
132
|
+
reproducing the content of the NOTICE file.
|
133
|
+
|
134
|
+
7. Disclaimer of Warranty.
|
135
|
+
|
136
|
+
Unless required by applicable law or agreed to in writing, Licensor provides the
|
137
|
+
Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
|
138
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
|
139
|
+
including, without limitation, any warranties or conditions of TITLE,
|
140
|
+
NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
|
141
|
+
solely responsible for determining the appropriateness of using or
|
142
|
+
redistributing the Work and assume any risks associated with Your exercise of
|
143
|
+
permissions under this License.
|
144
|
+
|
145
|
+
8. Limitation of Liability.
|
146
|
+
|
147
|
+
In no event and under no legal theory, whether in tort (including negligence),
|
148
|
+
contract, or otherwise, unless required by applicable law (such as deliberate
|
149
|
+
and grossly negligent acts) or agreed to in writing, shall any Contributor be
|
150
|
+
liable to You for damages, including any direct, indirect, special, incidental,
|
151
|
+
or consequential damages of any character arising as a result of this License or
|
152
|
+
out of the use or inability to use the Work (including but not limited to
|
153
|
+
damages for loss of goodwill, work stoppage, computer failure or malfunction, or
|
154
|
+
any and all other commercial damages or losses), even if such Contributor has
|
155
|
+
been advised of the possibility of such damages.
|
156
|
+
|
157
|
+
9. Accepting Warranty or Additional Liability.
|
158
|
+
|
159
|
+
While redistributing the Work or Derivative Works thereof, You may choose to
|
160
|
+
offer, and charge a fee for, acceptance of support, warranty, indemnity, or
|
161
|
+
other liability obligations and/or rights consistent with this License. However,
|
162
|
+
in accepting such obligations, You may act only on Your own behalf and on Your
|
163
|
+
sole responsibility, not on behalf of any other Contributor, and only if You
|
164
|
+
agree to indemnify, defend, and hold each Contributor harmless for any liability
|
165
|
+
incurred by, or claims asserted against, such Contributor by reason of your
|
166
|
+
accepting any such warranty or additional liability.
|
167
|
+
|
168
|
+
END OF TERMS AND CONDITIONS
|
169
|
+
|
170
|
+
APPENDIX: How to apply the Apache License to your work
|
171
|
+
|
172
|
+
To apply the Apache License to your work, attach the following boilerplate
|
173
|
+
notice, with the fields enclosed by brackets "{}" replaced with your own
|
174
|
+
identifying information. (Don't include the brackets!) The text should be
|
175
|
+
enclosed in the appropriate comment syntax for the file format. We also
|
176
|
+
recommend that a file or class name and description of purpose be included on
|
177
|
+
the same "printed page" as the copyright notice for easier identification within
|
178
|
+
third-party archives.
|
179
|
+
|
180
|
+
Copyright {yyyy} {name of copyright owner}
|
181
|
+
|
182
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
183
|
+
you may not use this file except in compliance with the License.
|
184
|
+
You may obtain a copy of the License at
|
185
|
+
|
186
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
187
|
+
|
188
|
+
Unless required by applicable law or agreed to in writing, software
|
189
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
190
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
191
|
+
See the License for the specific language governing permissions and
|
192
|
+
limitations under the License.
|
193
|
+
|
194
|
+
|
data/README.md
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
# Cassback
|
2
|
+
|
3
|
+
Welcome to your Cassback!
|
4
|
+
This is a project that aims to back up Cassandra SSTables and load them into HDFS for further usage.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Build the application into a gem using the command
|
9
|
+
|
10
|
+
$ gem build cassback.gemspec
|
11
|
+
|
12
|
+
You should see the following output :
|
13
|
+
|
14
|
+
Successfully built RubyGem
|
15
|
+
Name: cassback
|
16
|
+
Version: 0.1.0
|
17
|
+
File: cassback-0.1.0.gem
|
18
|
+
|
19
|
+
|
20
|
+
Install the application into your local gem store using the following command :
|
21
|
+
|
22
|
+
$ gem install cassback-0.1.0.gem
|
23
|
+
|
24
|
+
You should then see the following output :
|
25
|
+
|
26
|
+
Successfully installed cassback-0.1.0
|
27
|
+
Parsing documentation for cassback-0.1.0
|
28
|
+
Done installing documentation for cassback after 0 seconds
|
29
|
+
1 gem installed
|
30
|
+
|
31
|
+
## Usage
|
32
|
+
|
33
|
+
When the cassback gem is installed, it adds the **cassback** executable file into your PATH variable.
|
34
|
+
This means that you can execute it using one of the following commands and it will return an example of usage :
|
35
|
+
|
36
|
+
cassback
|
37
|
+
cassback -h
|
38
|
+
|
39
|
+
A simple command that you can use for starting a backup is :
|
40
|
+
|
41
|
+
cassback -S -C path_to_some_config_file.yml
|
42
|
+
|
43
|
+
## Configuration
|
44
|
+
|
45
|
+
The application has some default configuration defined.
|
46
|
+
You can override the default configuration in two ways :
|
47
|
+
|
48
|
+
1. Using a configuration file passed as parameter on the command line.
|
49
|
+
|
50
|
+
2. Using individual configuration properties passed as parameters on the command line.
|
51
|
+
The command line parameters have precedence over the configuration file.
|
52
|
+
|
53
|
+
## Orchestration
|
54
|
+
|
55
|
+
The tool is designed to do snapshots at **node level** (and not at **cluster level**) - basically it has to be installed
|
56
|
+
on each node and a separate process will have to be executed from there to trigger a node level snapshot. Because this task is
|
57
|
+
quite complex it is recommended to use an orchestration tool (like Rundeck) that allows you to execute the same command
|
58
|
+
on multiple machines and run the processes in parallel.
|
59
|
+
|
60
|
+
After all node backups are finished the orchestration tool will have to take care of signaling other applications that
|
61
|
+
the backup is completely finished. This is currently done by adding a new empty file to the cluster metadata folder that has
|
62
|
+
the format BACKUP_COMPLETED_yyyy_MM_dd. This has to be triggered only once by using the following command :
|
63
|
+
|
64
|
+
cassback -B [-d date] -C conf/path_to_some_config_file.yml
|
65
|
+
|
66
|
+
Optionally you can also pass a date; if it is not present, the current day's date will be assumed.
|
67
|
+
|
68
|
+
## Data Integrity
|
69
|
+
|
70
|
+
The project is using internally the webhdfs tool (see https://github.com/kzk/webhdfs) that is a Ruby project
|
71
|
+
built on top of the WebHDFS API (https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html).
|
72
|
+
Because we're using the WebHDFS API we get data integrity for free. The tool is also configurable so that in case of errors it
|
73
|
+
can retry the file download/upload of data. This is configurable via the following config file properties :
|
74
|
+
|
75
|
+
1. **hadoop.retryTimes** - the number of retries the tool should do before giving up. Default set to 5.
|
76
|
+
2. **hadoop.retryInterval** - the interval (in seconds) the tool should take between two attempts. Default set to 1 second.
|
77
|
+
|
78
|
+
If you want to check more about Hadoop's checksum algorithm that ensures data integrity you can check the
|
79
|
+
following link : https://www.safaribooksonline.com/library/view/hadoop-the-definitive/9781449328917/ch04.html
|
80
|
+
|
81
|
+
## Cleanup policy
|
82
|
+
|
83
|
+
Usually backups of databases take a lot of space. Even if we have optimized the code so the backups are done incrementally
|
84
|
+
(meaning that a file is not stored twice even if it's present in multiple backups), still cleanup needs to be done.
|
85
|
+
The tool has a cleanup policy of cleaning snapshots after some days have passed since the snapshot has been published.
|
86
|
+
This is configurable via the **cleanup.retentionDays** property in the configuration file. One point is that cleanup is
|
87
|
+
done at cluster level (for all nodes) since it doesn't make sense to keep data for only some of the nodes.
|
88
|
+
|
89
|
+
The command for triggering a cleanup is :
|
90
|
+
|
91
|
+
cassback -A -C conf/path_to_some_config_file.yml
|
92
|
+
|
93
|
+
# Unit tests
|
94
|
+
Unit tests can be executed locally by running the following command :
|
95
|
+
|
96
|
+
rake test
|
97
|
+
|
98
|
+
## Contributing
|
99
|
+
|
100
|
+
For now this is an internal Criteo project, but we're aiming to make it open source and publish it to GitHub.
|
101
|
+
|
102
|
+
Issue reports and merge requests are welcome on Criteo's GitLab at : https://gitlab.criteois.com/ruby-gems/cassback
|
103
|
+
|
data/Rakefile.rb
ADDED
data/bin/cassback
CHANGED
@@ -30,6 +30,7 @@ command_line_config = {
|
|
30
30
|
'cassandra' => {},
|
31
31
|
'hadoop' => {},
|
32
32
|
'restore' => {},
|
33
|
+
'cleanup' => {},
|
33
34
|
}
|
34
35
|
|
35
36
|
# Default options
|
@@ -38,13 +39,19 @@ options = {
|
|
38
39
|
'config' => '/etc/cassandra/conf/cassandra.yaml',
|
39
40
|
},
|
40
41
|
'hadoop' => {
|
41
|
-
'hostname'
|
42
|
-
'port'
|
43
|
-
'directory'
|
42
|
+
'hostname' => 'localhost',
|
43
|
+
'port' => 14_000,
|
44
|
+
'directory' => 'cassandra',
|
45
|
+
'retryTimes' => 5,
|
46
|
+
'retryInterval' => 1,
|
44
47
|
},
|
45
48
|
'restore' => {
|
46
49
|
'destination' => 'cassandra',
|
47
50
|
},
|
51
|
+
|
52
|
+
'cleanup' => {
|
53
|
+
'retentionDays' => 30,
|
54
|
+
},
|
48
55
|
}
|
49
56
|
|
50
57
|
# If no argument given in command line, print the help
|
@@ -52,7 +59,7 @@ ARGV << '-h' if ARGV.empty?
|
|
52
59
|
|
53
60
|
# Parse command line options
|
54
61
|
parser = OptionParser.new do |opts|
|
55
|
-
opts.banner = 'Usage: cassback
|
62
|
+
opts.banner = 'Usage: cassback [options]'
|
56
63
|
|
57
64
|
opts.separator ''
|
58
65
|
opts.separator 'Configuration:'
|
@@ -74,6 +81,13 @@ parser = OptionParser.new do |opts|
|
|
74
81
|
opts.on('-F', '--flush', 'removes a backuped snapshot from Hadoop, needs a date') do |_v|
|
75
82
|
action = 'delete'
|
76
83
|
end
|
84
|
+
opts.on('-B', '--backupFlag', 'creates an empty file to signal that the backup has finished, can be used with a date, \
|
85
|
+
today date is assumed if no date is provided') do |_v|
|
86
|
+
action = 'backupFlag'
|
87
|
+
end
|
88
|
+
opts.on('-A', '--cleanup', 'cleans up old snapshots') do |_v|
|
89
|
+
action = 'cleanup'
|
90
|
+
end
|
77
91
|
|
78
92
|
opts.separator ''
|
79
93
|
opts.separator 'Action related:'
|
@@ -133,7 +147,9 @@ end
|
|
133
147
|
|
134
148
|
begin
|
135
149
|
# Create the Hadoop object
|
136
|
-
hadoop = Hadoop.new(host: options['hadoop']['hostname'], port: options['hadoop']['port'],
|
150
|
+
hadoop = Hadoop.new(host: options['hadoop']['hostname'], port: options['hadoop']['port'],
|
151
|
+
base_dir: options['hadoop']['directory'], retry_times: options['hadoop']['retryTimes'],
|
152
|
+
retry_interval: options['hadoop']['retryInterval'])
|
137
153
|
|
138
154
|
# Create the Cassandra object
|
139
155
|
cassandra = Cassandra.new(options['cassandra']['config'], logger)
|
@@ -161,6 +177,18 @@ begin
|
|
161
177
|
elsif action == 'delete'
|
162
178
|
raise('No date given') unless options.include? 'date'
|
163
179
|
bck.delete_snapshots(node: options['node'], date: options['date'])
|
180
|
+
|
181
|
+
# Create backup flag.
|
182
|
+
elsif action == 'backupFlag'
|
183
|
+
# Use today's date if no date has been provided
|
184
|
+
date = options['date']
|
185
|
+
date ||= Time.new.strftime('%Y_%m_%d')
|
186
|
+
bck.create_backup_flag(date)
|
187
|
+
|
188
|
+
# Cleanup old snapshots based on cleanup.retentionDays
|
189
|
+
elsif action == 'cleanup'
|
190
|
+
days = options['cleanup']['retentionDays'].to_i
|
191
|
+
bck.cleanup(days)
|
164
192
|
end
|
165
193
|
|
166
194
|
# In case of failure
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'cassback'
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require 'irb'
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
data/cassback.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'cassback/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = 'cassback'
|
8
|
+
spec.version = Cassback::VERSION
|
9
|
+
spec.authors = ['Vincent Van Hollebeke', 'Bogdan Niculescu']
|
10
|
+
spec.email = ['v.vanhollebeke@criteo.com', 'b.niculescu@criteo.com']
|
11
|
+
|
12
|
+
spec.summary = 'Cassandra backup to HDFS.'
|
13
|
+
spec.description = 'This is a tool that allows creating backups of Cassandra and pushing them into HDFS.'
|
14
|
+
spec.homepage = 'http://rubygems.org/gems/cassback'
|
15
|
+
|
16
|
+
spec.licenses = ['Apache-2.0']
|
17
|
+
|
18
|
+
spec.files = `git ls-files`.split("\n")
|
19
|
+
spec.test_files = `git ls-files -- test/*`.split("\n")
|
20
|
+
spec.bindir = 'bin'
|
21
|
+
spec.executables << 'cassback'
|
22
|
+
spec.require_paths = ['lib']
|
23
|
+
|
24
|
+
spec.add_development_dependency 'bundler', '~> 1.11'
|
25
|
+
spec.add_development_dependency 'rake', '~> 10.0'
|
26
|
+
|
27
|
+
spec.add_runtime_dependency 'gssapi', '~> 1.2', '>= 1.2.0'
|
28
|
+
spec.add_runtime_dependency 'webhdfs', '~> 0.8', '>= 0.8.0'
|
29
|
+
spec.add_runtime_dependency 'table_print', '~> 1.5', '>= 1.5.6'
|
30
|
+
end
|
data/conf/local.yml
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
cassandra:
|
2
|
+
# config: "/etc/cassandra/conf/cassandra.yaml"
|
3
|
+
config: "/Users/b.niculescu/Tools/apache-cassandra-2.0.16/conf/cassandra.yaml"
|
4
|
+
|
5
|
+
hadoop:
|
6
|
+
# hostname: "10.60.34.217"
|
7
|
+
hostname: "jobs-user.hpc.criteo.prod"
|
8
|
+
port: 14000
|
9
|
+
# directory: "/user/v.vanhollebeke/cassandra"
|
10
|
+
directory: "/tmp/b.niculescu/cassandra"
|
11
|
+
retryTimes : 3
|
12
|
+
retryInterval : 1
|
13
|
+
|
14
|
+
restore:
|
15
|
+
destination: "cassback_restore"
|
16
|
+
|
17
|
+
cleanup:
|
18
|
+
retentionDays: 30
|
data/conf/preprod.yml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
cassandra:
|
2
|
+
config: "/etc/cassandra/conf/cassandra.yaml"
|
3
|
+
|
4
|
+
hadoop:
|
5
|
+
hostname: "jobs-user.hpc.criteo.preprod"
|
6
|
+
port: 14000
|
7
|
+
directory: "/tmp/cassandraback/preprod/"
|
8
|
+
retryTimes : 5
|
9
|
+
retryInterval : 1
|
10
|
+
|
11
|
+
restore:
|
12
|
+
destination: "cassback_restore"
|
13
|
+
|
14
|
+
cleanup:
|
15
|
+
retentionDays: 30
|
data/conf/prod.yml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
cassandra:
|
2
|
+
config: "/etc/cassandra/conf/cassandra.yaml"
|
3
|
+
|
4
|
+
hadoop:
|
5
|
+
hostname: "jobs-user.hpc.criteo.prod"
|
6
|
+
port: 14000
|
7
|
+
directory: "/tmp/cassandrabackups/prod/"
|
8
|
+
retryTimes : 5
|
9
|
+
retryInterval : 1
|
10
|
+
|
11
|
+
restore:
|
12
|
+
destination: "cassback_restore"
|
13
|
+
|
14
|
+
cleanup:
|
15
|
+
retentionDays: 30
|
data/lib/backuptool.rb
CHANGED
@@ -40,7 +40,8 @@ class BackupTool
|
|
40
40
|
begin
|
41
41
|
if date == 'ALL'
|
42
42
|
ls = @hadoop.list("#{@hadoop.base_dir}/#{@metadir}/#{@cassandra.cluster_name}/#{node}")
|
43
|
-
ls.
|
43
|
+
ls_metadata = ls.select { |item| item['pathSuffix'].include? 'cass_snap_' }
|
44
|
+
ls_metadata.each do |item|
|
44
45
|
date = item['pathSuffix'].gsub('cass_snap_', '')
|
45
46
|
metadata = get_snapshot_metadata(node, date)
|
46
47
|
snapshot = CassandraSnapshot.new(@cassandra.cluster_name, node, date, metadata)
|
@@ -60,7 +61,8 @@ class BackupTool
|
|
60
61
|
if node == 'ALL'
|
61
62
|
begin
|
62
63
|
ls = @hadoop.list("#{@hadoop.base_dir}/#{@metadir}/#{@cassandra.cluster_name}")
|
63
|
-
ls.
|
64
|
+
ls_nodes = ls.select { |item| item['type'].casecmp('DIRECTORY') == 0 }
|
65
|
+
ls_nodes.each do |item|
|
64
66
|
n = item['pathSuffix']
|
65
67
|
result += get_snapshots_node(n, date)
|
66
68
|
end
|
@@ -141,6 +143,55 @@ class BackupTool
|
|
141
143
|
end
|
142
144
|
end
|
143
145
|
|
146
|
+
# Cleans up backups that are older than a number of days.
|
147
|
+
# This function cleans data on all nodes.
|
148
|
+
def cleanup(days)
|
149
|
+
retention_date = Date.today - days
|
150
|
+
@logger.info("Cleaning backup data on all nodes before #{retention_date}.")
|
151
|
+
|
152
|
+
all_snapshots = search_snapshots
|
153
|
+
@logger.info("A total of #{all_snapshots.size} snapshots were found on Hadoop server.")
|
154
|
+
|
155
|
+
snapshots_to_be_deleted = all_snapshots.select { |snapshot| snapshot.get_date < retention_date }
|
156
|
+
@logger.info("A total of #{snapshots_to_be_deleted.size} snapshots will be deleted.")
|
157
|
+
|
158
|
+
snapshots_to_be_deleted.each do |snapshot|
|
159
|
+
delete_snapshots(node: snapshot.node, date: snapshot.date)
|
160
|
+
end
|
161
|
+
|
162
|
+
all_backup_flags = get_backup_flags
|
163
|
+
@logger.info("A total of #{all_backup_flags.size} back up flags were found on Hadoop server.")
|
164
|
+
|
165
|
+
backup_flags_to_be_delete = all_backup_flags.select { |flag| flag.date < retention_date }
|
166
|
+
@logger.info("A total of #{backup_flags_to_be_delete.size} backup flags will be deleted.")
|
167
|
+
|
168
|
+
backup_flags_location = @hadoop.base_dir + '/' + @metadir + '/' + @cassandra.cluster_name
|
169
|
+
backup_flags_to_be_delete.each do |flag|
|
170
|
+
file = backup_flags_location + '/' + flag.file
|
171
|
+
@logger.info("Deleting #{file}")
|
172
|
+
@hadoop.delete(file)
|
173
|
+
end
|
174
|
+
end
|
175
|
+
|
176
|
+
# Method that creates a backup flag to signal that the backup is finished on all nodes
|
177
|
+
# This is an individual command that has to be called manually after snapshots have finished
|
178
|
+
def create_backup_flag(date)
|
179
|
+
file_name = 'BACKUP_COMPLETED_' + date
|
180
|
+
remote_file = @hadoop.base_dir + '/' + @metadir + '/' + @cassandra.cluster_name + '/' + file_name
|
181
|
+
|
182
|
+
@logger.info('Setting backup completed flag : ' + remote_file)
|
183
|
+
@hadoop.create(remote_file, '', overwrite: true)
|
184
|
+
end
|
185
|
+
|
186
|
+
def get_backup_flags
|
187
|
+
backup_flags_location = @hadoop.base_dir + '/' + @metadir + '/' + @cassandra.cluster_name
|
188
|
+
ls = @hadoop.list(backup_flags_location)
|
189
|
+
backup_flags = ls.select { |item| item['pathSuffix'].include? 'BACKUP_COMPLETED_' }
|
190
|
+
backup_flags.collect do |file|
|
191
|
+
BackupFlag.new(@cassandra.cluster_name, file['pathSuffix'])
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
144
195
|
# Download a file from HDFS, buffered way
|
145
196
|
# * *Args* :
|
146
197
|
# - +remote+ -> HDFS path
|
data/lib/cassandra.rb
CHANGED
@@ -150,4 +150,19 @@ class CassandraSnapshot
|
|
150
150
|
d = @date <=> other.date
|
151
151
|
c * 3 + n * 2 + d
|
152
152
|
end
|
153
|
+
|
154
|
+
def get_date
|
155
|
+
DateTime.strptime(@date, '%Y_%m_%d')
|
156
|
+
end
|
157
|
+
end
|
158
|
+
|
159
|
+
class BackupFlag
|
160
|
+
attr_reader :cluster, :date, :file
|
161
|
+
|
162
|
+
def initialize(cluster, file)
|
163
|
+
@cluster = cluster
|
164
|
+
@file = file.dup
|
165
|
+
date_as_string = file.sub! 'BACKUP_COMPLETED_', ''
|
166
|
+
@date = DateTime.strptime(date_as_string, '%Y_%m_%d')
|
167
|
+
end
|
153
168
|
end
|
data/lib/hadoop.rb
CHANGED
@@ -6,9 +6,12 @@ WebHDFS::ClientV1::REDIRECTED_OPERATIONS.delete('OPEN')
|
|
6
6
|
# WebHDFS client used by the backup tool.
# Remembers the base directory and switches on kerberos and retry settings.
class Hadoop < WebHDFS::Client
  attr_reader :base_dir

  # * *Args* :
  #   - +host+ -> host of the WebHDFS/HttpFS endpoint
  #   - +port+ -> port of the endpoint
  #   - +base_dir+ -> root directory exposed through +base_dir+
  #   - +retry_times+ -> stored in @retry_times
  #   - +retry_interval+ -> stored in @retry_interval
  def initialize(host: 'localhost', port: 14_000, base_dir: '/', retry_times: 5, retry_interval: 1)
    # Pass the values positionally; the former "host = host" form was just a
    # self-assignment used as an argument.
    super(host, port)
    # NOTE(review): these instance variables are presumably the settings the
    # WebHDFS::Client parent class reads - confirm against the webhdfs gem.
    @kerberos = true
    @base_dir = base_dir
    @retry_known_errors = true
    @retry_times = retry_times
    @retry_interval = retry_interval
  end
end
|
data/scripts/deploy.sh
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
[cstars02-par]
|
2
|
+
cstars02e01-par ansible_ssh_host="cstars02e01-par.storage.criteo.prod"
|
3
|
+
cstars02e02-par ansible_ssh_host="cstars02e02-par.storage.criteo.prod"
|
4
|
+
cstars02e03-par ansible_ssh_host="cstars02e03-par.storage.criteo.prod"
|
5
|
+
cstars02e04-par ansible_ssh_host="cstars02e04-par.storage.criteo.prod"
|
6
|
+
cstars02e05-par ansible_ssh_host="cstars02e05-par.storage.criteo.prod"
|
7
|
+
cstars02e06-par ansible_ssh_host="cstars02e06-par.storage.criteo.prod"
|
8
|
+
cstars02e07-par ansible_ssh_host="cstars02e07-par.storage.criteo.prod"
|
9
|
+
cstars02e08-par ansible_ssh_host="cstars02e08-par.storage.criteo.prod"
|
10
|
+
cstars02e09-par ansible_ssh_host="cstars02e09-par.storage.criteo.prod"
|
11
|
+
cstars02e10-par ansible_ssh_host="cstars02e10-par.storage.criteo.prod"
|
12
|
+
cstars02e11-par ansible_ssh_host="cstars02e11-par.storage.criteo.prod"
|
13
|
+
cstars02e12-par ansible_ssh_host="cstars02e12-par.storage.criteo.prod"
|
14
|
+
cstars02e13-par ansible_ssh_host="cstars02e13-par.storage.criteo.prod"
|
15
|
+
cstars02e14-par ansible_ssh_host="cstars02e14-par.storage.criteo.prod"
|
16
|
+
cstars02e15-par ansible_ssh_host="cstars02e15-par.storage.criteo.prod"
|
17
|
+
cstars02e16-par ansible_ssh_host="cstars02e16-par.storage.criteo.prod"
|
18
|
+
cstars02e17-par ansible_ssh_host="cstars02e17-par.storage.criteo.prod"
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#!/bin/bash
#
# Manual backup of the local Cassandra node: takes a fresh snapshot and pushes
# every snapshot directory to HDFS through ./httpfs.sh.

principal='v.vanhollebeke@CRITEOIS.LAN'
data_root='/var/opt/cassandra/data'

kinit "$principal" -k -t ~/keytab

date=$(date +%Y_%m_%d)

nodetool clearsnapshot

snapdir=$(nodetool snapshot | grep directory | awk '{print $NF}')
echo "Snapshot is $snapdir"

# With an empty snapshot name the filter below would match every snapshot
# directory on the node, so stop here instead of uploading the wrong data.
if [ -z "$snapdir" ]; then
    echo "ERROR: could not determine the snapshot directory" >&2
    exit 1
fi

# Read the directory list on fd 3 so commands inside the loop keep their stdin
# and paths are never word-split.
while IFS= read -r dir <&3; do
    # Renew the Kerberos ticket when no non-expired one is listed.
    kok=$(klist -l | grep -F "$principal" | grep -vc Expired)
    if [ "$kok" -eq 0 ]; then
        echo "Must renew Kerberos ticket"
        kinit "$principal" -k -t ~/keytab
    else
        echo "Kerberos ticket OK"
    fi
    # Fields 6 and 7 of /var/opt/cassandra/data/<keyspace>/<table>/... ;
    # these positions are tied to the depth of $data_root.
    keyspace=$(echo "$dir" | awk -F/ '{print $6}')
    table=$(echo "$dir" | awk -F/ '{print $7}')
    echo "Saving $keyspace $table"
    ./httpfs.sh "$data_root/$keyspace/$table/snapshots/$snapdir" "tmp/cassandrabackups/prod/cstars02/$date/$HOSTNAME/$table"

done 3< <(find "$data_root" -type d | grep -F "snapshots/$snapdir")

echo "FINISHED !!!!"
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#!/bin/sh
#
# Uploads every regular file found under a local directory into a remote
# directory through the WebHDFS REST endpoint.
# Usage: httpfs.sh <local_dir> <remote_dir>

BASE='http://0.httpfs.hpc.criteo.prod:14000/webhdfs/v1'
#BASE='http://httpfs.pa4.hpc.criteo.prod:14000'

IN=$1
OUT=$2

echo "Creating destination directory: $OUT"
curl --negotiate -u : "$BASE/$OUT?op=MKDIRS&permission=0777" -X PUT -s > /dev/null

# Stream paths line by line so names containing spaces are not split.
# All files land directly in $OUT, whatever their depth under $IN.
find "$IN" -type f | while IFS= read -r p
do
    f=$(basename "$p")
    echo "$p"

    # Create file: capture the response headers first so that the status
    # tested is curl's own, not that of the last command of a pipeline.
    headers=$(curl --negotiate -u : "$BASE/$OUT/$f?op=CREATE&overwrite=true&permission=0777" -i -X PUT -s < /dev/null)
    if [ $? != 0 ]; then
        echo "ERROR"
        continue
    fi

    # The upload URL is taken from the last Location header of the response.
    dest=$(printf '%s\n' "$headers" | grep Location | tail -n1 | cut -d' ' -f2 | tr -d '\r\n')
    echo "DEST IS ${dest}"
    if [ -z "$dest" ]; then
        echo "ERROR"
        continue
    fi

    # Upload file: use the path reported by find, which is also correct for
    # files located in sub-directories of $IN.
    curl --negotiate -u : "$dest" -i -X PUT -T "$p" -H 'Content-Type: application/octet-stream' > /dev/null < /dev/null \
        || echo "ERROR"

done
|
@@ -0,0 +1,26 @@
|
|
1
|
+
[libdefaults]
|
2
|
+
dns_lookup_realm = true
|
3
|
+
dns_lookup_kdc = true
|
4
|
+
ticket_lifetime = 24h
|
5
|
+
renew_lifetime = 7d
|
6
|
+
forwardable = true
|
7
|
+
default_realm = CRITEOIS.LAN
|
8
|
+
udp_preference_limit = 1
|
9
|
+
realm_try_domains = 1
|
10
|
+
permitted_enctypes = aes128-cts-hmac-sha1-96 des3-cbc-sha1 arcfour-hmac
|
11
|
+
default_tkt_enctypes = aes128-cts-hmac-sha1-96 des3-cbc-sha1 arcfour-hmac
|
12
|
+
[domain_realm]
|
13
|
+
.hpc.criteo.preprod = HPC.CRITEO.PREPROD
|
14
|
+
.hpc.criteo.prod = AMS.HPC.CRITEO.PROD
|
15
|
+
.pa4.hpc.criteo.prod = PA4.HPC.CRITEO.PROD
|
16
|
+
.as.hpc.criteo.prod = AS.HPC.CRITEO.PROD
|
17
|
+
.na.hpc.criteo.prod = NA.HPC.CRITEO.PROD
|
18
|
+
.cn.hpc.criteo.prod = CN.HPC.CRITEO.PROD
|
19
|
+
[capaths]
|
20
|
+
CRITEOIS.LAN = {
|
21
|
+
AMS.HPC.CRITEO.PROD = .
|
22
|
+
PA4.HPC.CRITEO.PROD = AMS.HPC.CRITEO.PROD
|
23
|
+
AS.HPC.CRITEO.PROD = AMS.HPC.CRITEO.PROD
|
24
|
+
NA.HPC.CRITEO.PROD = AMS.HPC.CRITEO.PROD
|
25
|
+
CN.HPC.CRITEO.PROD = AMS.HPC.CRITEO.PROD
|
26
|
+
}
|
@@ -0,0 +1,34 @@
|
|
1
|
+
---
# Tasks for the manual backup role: install the Kerberos configuration and
# keytab, copy the backup scripts, run the backup, then clear the snapshots
# and verify that they are gone.

- name: Copy krb5.conf into /etc
  copy:
    src: krb5.conf
    dest: /etc/krb5.conf
  become: true
  tags: keytab

- name: Copy my keytab
  copy:
    src: keytab
    dest: "~/keytab"
  tags: keytab

- name: Check if keytab works
  command: kinit $USER@CRITEOIS.LAN -k -t ~/keytab
  tags: keytab

- name: Copy httpfs.sh script
  copy:
    src: httpfs.sh
    dest: "~/httpfs.sh"
    # Quoted so the mode reaches the module as a string, not as a YAML number.
    mode: "0750"
  tags: backup

- name: Copy backup.sh script
  copy:
    src: backup.sh
    dest: "~/backup.sh"
    mode: "0750"
  tags: backup

- name: Start Backup
  shell: ./backup.sh >logfile 2>&1
  args:
    # Quoted: a bare ~ is the YAML null value.
    chdir: "~"
  tags: backup

- name: Clear snapshots
  shell: sudo nodetool clearsnapshot
  tags: clear

- name: Verify if snapshots are REALLY deleted
  # -eq instead of ==, which is a bash extension and fails when sh is not bash.
  shell: "[ $(find /var/opt/cassandra -type d | grep snap | wc -l) -eq 0 ]"
  tags: verify
|
data/scripts/pre-push
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/bin/bash
#
# Git pre-push hook: runs rubocop in auto-correct mode and refuses the push
# when git status reports modified files afterwards.

echo "Running rubocop with auto-correct" >&2
bundle exec rubocop --config .rubocop.yml --auto-correct --out /dev/null

changed=$(git status | grep modified | wc -l)

# Clean tree: let the push go through.
if [ $changed -eq 0 ]; then
    echo -e "\e[1;32mNothing to correct, pushing\e[0m" >&2
    exit 0
fi

# Pluralise the message when more than one file is reported.
plural=''
[ $changed -gt 1 ] && plural='s'

echo -e "\e[1;31m$changed file$plural were modified, please add commit before pushing\e[0m" >&2
exit 1
|
@@ -0,0 +1,33 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require_relative '../lib/cassandra'
|
3
|
+
|
4
|
+
# Stub implementation that simulates cassandra backups.
|
5
|
+
# Stub implementation that simulates cassandra backups.
# It writes fake sstable files under test/cassandra/<cluster>/<node>/ and
# hands out snapshots that reference them.
class CassandraStub
  attr_reader :data_path, :cluster_name, :node_name

  # * *Args* :
  #   - +cluster_name+ -> name of the simulated cluster
  #   - +node_name+ -> name of the simulated node
  #   - +date+ -> date given to the snapshots created by +new_snapshot+
  #   - +file_indexes+ -> one fake SSTable-<index>-Data.db is created per entry
  def initialize(cluster_name = 'cluster1', node_name = 'node1', date = '', file_indexes = [])
    @cluster_name = cluster_name
    @node_name = node_name
    @date = date
    @data_path = 'test/cassandra' + '/' + cluster_name + '/' + node_name + '/'
    FileUtils.mkdir_p(@data_path)

    # create some fake sstables
    @metadata = Set.new
    file_indexes.each do |index|
      file_name = "SSTable-#{index}-Data.db"
      # @data_path already ends with a separator, so none is added here.
      File.write(@data_path + file_name, 'This is a test file that simulates an SSTable')
      @metadata.add(file_name)
    end
  end

  # Builds a snapshot object that simply points to the existing fake files.
  def new_snapshot
    CassandraSnapshot.new(@cluster_name, @node_name, @date, @metadata)
  end

  # Removes the whole fake data directory; the snapshot argument is ignored.
  def delete_snapshot(_snapshot)
    FileUtils.rm_rf(@data_path)
  end
end
|
data/test/hadoop_stub.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'fileutils'
|
4
|
+
|
5
|
+
# A stub implementation of Hadoop that read/writes to local file instead of using webhdfs
|
6
|
+
# A stub implementation of Hadoop that reads/writes local files instead of
# using webhdfs.
class HadoopStub
  attr_reader :base_dir

  def initialize(base_dir)
    @base_dir = base_dir
  end

  # Lists the direct children of +path+.
  # * *Returns* :
  #   - an array of hashes with the 'pathSuffix' and 'type' keys, similar to
  #     the ones that hadoop sends (containing fewer entries)
  def list(path, _options = {})
    # Sorted so the order does not depend on the file system.
    Dir.glob("#{path}/*").sort.collect do |entry|
      {
        'pathSuffix' => File.basename(entry),
        'type' => File.file?(entry) ? 'FILE' : 'DIRECTORY',
      }
    end
  end

  # Recursively lists the regular files below +path+, sorted by path.
  def list_files(path, _options = {})
    Dir.glob("#{path}/**/*").sort.select { |entry| File.file?(entry) }
  end

  # Writes +body+ to +path+, creating the parent directories as needed.
  # * *Args* :
  #   - +path+ -> destination file
  #   - +body+ -> a string, or an object responding to +read+ (e.g. a File)
  def create(path, body, _options = {})
    FileUtils.mkdir_p(File.expand_path('..', path))
    # The former condition-less "elsif" only wrote string bodies because the
    # write expression was evaluated as its condition.
    content = body.respond_to?(:read) ? body.read : body
    File.open(path, 'w') { |file| file.write(content) }
  end

  # Returns the whole content of +path+ (the file handle is closed afterwards).
  def read(path, _options = {})
    File.read(path)
  end

  # Recursively removes +path+; a missing path is ignored.
  def delete(path, _options = {})
    FileUtils.rm_rf(path)
  end
end
|
@@ -0,0 +1,180 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'test/unit'
|
3
|
+
require 'logger'
|
4
|
+
|
5
|
+
require_relative '../lib/backuptool'
|
6
|
+
require_relative 'hadoop_stub'
|
7
|
+
require_relative 'cassandra_stub'
|
8
|
+
|
9
|
+
# End-to-end tests of BackupTool, run against the local HadoopStub and
# CassandraStub instead of real clusters.
class TestSimpleNumber < Test::Unit::TestCase
  # Directories written by the stubs and by restores during the tests.
  TEST_DIRS = ['test/hadoop', 'test/restore', 'test/cassandra'].freeze

  # Runs after every test, including failed ones, so that files left behind
  # by one test cannot change the result of the next.
  def teardown
    TEST_DIRS.each { |dir| FileUtils.rm_rf(dir) }
  end

  def test_new_snapshot
    hadoop = HadoopStub.new('test/hadoop')
    create_new_snapshot(hadoop, 'node1', '2016_04_22', [1, 2])

    remote_files = hadoop.list_files('test/hadoop')
    # two files were backed up + one metadata file
    assert_equal(3, remote_files.size)

    # files were created in the correct location
    assert_equal('test/hadoop/cass_snap_metadata/cluster1/node1/cass_snap_2016_04_22', remote_files[0])
    assert_equal('test/hadoop/cluster1/node1/SSTable-1-Data.db', remote_files[1])
    assert_equal('test/hadoop/cluster1/node1/SSTable-2-Data.db', remote_files[2])

    # metadata file contains the sstables.
    metadata_content = File.read(remote_files[0])
    assert(metadata_content.include?('SSTable-1-Data.db'))
    assert(metadata_content.include?('SSTable-2-Data.db'))
  end

  def test_two_snapshots
    hadoop = HadoopStub.new('test/hadoop')
    create_new_snapshot(hadoop, 'node1', '2016_04_22', [1, 2])
    create_new_snapshot(hadoop, 'node1', '2016_04_23', [2, 3, 4])

    remote_files = hadoop.list_files('test/hadoop')
    # four distinct sstables were backed up + two metadata files
    assert_equal(6, remote_files.size)

    # files were created in the correct location
    # no duplicate files are stored
    assert_equal('test/hadoop/cass_snap_metadata/cluster1/node1/cass_snap_2016_04_22', remote_files[0])
    assert_equal('test/hadoop/cass_snap_metadata/cluster1/node1/cass_snap_2016_04_23', remote_files[1])
    assert_equal('test/hadoop/cluster1/node1/SSTable-1-Data.db', remote_files[2])
    assert_equal('test/hadoop/cluster1/node1/SSTable-2-Data.db', remote_files[3])
    assert_equal('test/hadoop/cluster1/node1/SSTable-3-Data.db', remote_files[4])
    assert_equal('test/hadoop/cluster1/node1/SSTable-4-Data.db', remote_files[5])

    # metadata on first backup file contains the sstables.
    metadata_content = File.read(remote_files[0])
    assert(metadata_content.include?('SSTable-1-Data.db'))
    assert(metadata_content.include?('SSTable-2-Data.db'))

    # metadata on second backup file contains the sstables.
    metadata_content = File.read(remote_files[1])
    assert(metadata_content.include?('SSTable-2-Data.db'))
    assert(metadata_content.include?('SSTable-3-Data.db'))
    assert(metadata_content.include?('SSTable-4-Data.db'))
  end

  def test_restore
    hadoop = HadoopStub.new('test/hadoop')
    backup_tool = create_new_snapshot(hadoop, 'node1', '2016_04_22', [1, 2])

    # restore a newly created snapshot
    backup_tool.restore_snapshot('node1', '2016_04_22', 'test/restore')

    restored_files = hadoop.list_files('test/restore')
    # two files were restored
    assert_equal(2, restored_files.size)
    assert_equal('test/restore/SSTable-1-Data.db', restored_files[0])
    assert_equal('test/restore/SSTable-2-Data.db', restored_files[1])
  end

  def test_delete
    hadoop = HadoopStub.new('test/hadoop')
    backup_tool = create_new_snapshot(hadoop, 'node1', '2016_04_22', [1, 2])

    # delete a newly created snapshot
    backup_tool.delete_snapshots(node: 'node1', date: '2016_04_22')

    remote_files = hadoop.list_files('test/hadoop')
    assert_equal(0, remote_files.size)
  end

  def test_backup_flag
    hadoop = HadoopStub.new('test/hadoop')
    backup_tool = create_new_snapshot(hadoop, 'node1', '2016_04_22', [1, 2])

    backup_tool.create_backup_flag('2016_04_22')

    remote_files = hadoop.list_files('test/hadoop')
    assert_equal(4, remote_files.size)
    # Flag is created at cluster level
    assert_equal('test/hadoop/cass_snap_metadata/cluster1/BACKUP_COMPLETED_2016_04_22', remote_files[0])
  end

  def test_get_backup_flag
    hadoop = HadoopStub.new('test/hadoop')
    backup_tool = create_new_snapshot(hadoop, 'node1', '2016_04_22', [1, 2])

    backup_tool.create_backup_flag('2016_04_22')
    flags = backup_tool.get_backup_flags

    # One flag found
    assert_equal(1, flags.size)
    # Flag points to the correct file
    assert_equal('cluster1', flags[0].cluster)
    assert_equal('BACKUP_COMPLETED_2016_04_22', flags[0].file)
  end

  def test_cleanup
    hadoop = HadoopStub.new('test/hadoop')
    retention_days = 30

    date_31_days_back = (Date.today - 31).strftime('%Y_%m_%d')
    date_30_days_back = (Date.today - 30).strftime('%Y_%m_%d')

    # Two backups on two nodes
    create_new_snapshot(hadoop, 'node1', date_31_days_back, [1, 2, 3, 4])
    create_new_snapshot(hadoop, 'node2', date_31_days_back, [1, 2, 3, 4])
    create_new_snapshot(hadoop, 'node1', date_30_days_back, [3, 4, 5, 6])
    backup_tool = create_new_snapshot(hadoop, 'node2', date_30_days_back, [4, 5, 6, 7])

    # Both backups are marked as completed
    # NOTE(review): the flag of the most recent date is written twice,
    # presumably on purpose (a repeated flag must still count once) - confirm.
    backup_tool.create_backup_flag(date_31_days_back)
    backup_tool.create_backup_flag(date_30_days_back)
    backup_tool.create_backup_flag(date_30_days_back)

    backup_tool.cleanup(retention_days)

    # Two snapshots were deleted, two were kept
    snapshots = backup_tool.search_snapshots
    assert_equal(2, snapshots.size)
    assert_equal('node1', snapshots[0].node)
    assert_equal(date_30_days_back, snapshots[0].date)
    assert_equal('node2', snapshots[1].node)
    assert_equal(date_30_days_back, snapshots[1].date)

    # One backup flag was deleted, one was kept.
    backup_flags = backup_tool.get_backup_flags
    assert_equal(1, backup_flags.size)
    assert_equal("BACKUP_COMPLETED_#{date_30_days_back}", backup_flags[0].file)
  end

  # Helper (not a test): creates fake sstables for +node+, snapshots them into
  # +hadoop+ and returns the BackupTool that was used.
  def create_new_snapshot(hadoop, node, date, file_indexes)
    logger = Logger.new(STDOUT)
    cassandra = CassandraStub.new('cluster1', node, date, file_indexes)
    backup_tool = BackupTool.new(cassandra, hadoop, logger)

    backup_tool.new_snapshot

    backup_tool
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cassback
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Vincent Van Hollebeke
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2016-04-
|
12
|
+
date: 2016-04-26 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|
@@ -109,10 +109,36 @@ executables:
|
|
109
109
|
extensions: []
|
110
110
|
extra_rdoc_files: []
|
111
111
|
files:
|
112
|
+
- ".gitignore"
|
113
|
+
- ".rubocop.yml_disabled"
|
114
|
+
- Gemfile
|
115
|
+
- LICENSE
|
116
|
+
- README.md
|
117
|
+
- Rakefile.rb
|
112
118
|
- bin/cassback
|
119
|
+
- bin/console
|
120
|
+
- bin/setup
|
121
|
+
- cassback.gemspec
|
122
|
+
- conf/local.yml
|
123
|
+
- conf/preprod.yml
|
124
|
+
- conf/prod.yml
|
113
125
|
- lib/backuptool.rb
|
114
126
|
- lib/cassandra.rb
|
127
|
+
- lib/cassback/version.rb
|
115
128
|
- lib/hadoop.rb
|
129
|
+
- scripts/deploy.sh
|
130
|
+
- scripts/manualbackups/ansible.cfg
|
131
|
+
- scripts/manualbackups/inventory.txt
|
132
|
+
- scripts/manualbackups/play_book.sh
|
133
|
+
- scripts/manualbackups/playbooks/backups.yml
|
134
|
+
- scripts/manualbackups/roles/planb/files/backup.sh
|
135
|
+
- scripts/manualbackups/roles/planb/files/httpfs.sh
|
136
|
+
- scripts/manualbackups/roles/planb/files/krb5.conf
|
137
|
+
- scripts/manualbackups/roles/planb/tasks/main.yml
|
138
|
+
- scripts/pre-push
|
139
|
+
- test/cassandra_stub.rb
|
140
|
+
- test/hadoop_stub.rb
|
141
|
+
- test/test_backuptool.rb
|
116
142
|
homepage: http://rubygems.org/gems/cassback
|
117
143
|
licenses:
|
118
144
|
- Apache-2.0
|
@@ -133,9 +159,11 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
133
159
|
version: '0'
|
134
160
|
requirements: []
|
135
161
|
rubyforge_project:
|
136
|
-
rubygems_version: 2.
|
162
|
+
rubygems_version: 2.4.8
|
137
163
|
signing_key:
|
138
164
|
specification_version: 4
|
139
165
|
summary: Cassandra backup to HDFS.
|
140
|
-
test_files:
|
141
|
-
|
166
|
+
test_files:
|
167
|
+
- test/cassandra_stub.rb
|
168
|
+
- test/hadoop_stub.rb
|
169
|
+
- test/test_backuptool.rb
|