logstash-input-s3 0.1.0

checksums.yaml ADDED
@@ -0,0 +1,15 @@
+ ---
+ !binary "U0hBMQ==":
+   metadata.gz: !binary |-
+     NzkxNDE2MTI2NWM5YzVjNGNlNzU5ZjI3OTMxM2U2OWVkZjQ1ZDJmOQ==
+   data.tar.gz: !binary |-
+     YzlhZWI4NGMyZTY1N2E0Njk5M2Y3NjI1YTIwNzYyNWJhM2QzZDI2ZQ==
+ SHA512:
+   metadata.gz: !binary |-
+     ZDkzNTVhYmE2YTg4ZWYyZTNiMjQ5M2RhMjRiNmZiNzE0ZDFhZjIxMTIwNDM0
+     NmFhMzMyZmY2MTkwZWM3Y2ZlZDhiYzQ1NThhNzQ2ODgyYThjNzFiNDhkMmE3
+     Zjc5N2U2ODlkYmYwM2RiZmQ4Y2I0OGVmZTc0YTdhN2IzMGI3YmU=
+   data.tar.gz: !binary |-
+     NjI1NmFiOTY1ZWNiNGZiZTQ4YjE4ZjFiNGY3ZjU3MGU1NDgwYTQyODI0YTEx
+     MGNkMjA5MGNmNTdmNTJlNDAyNjlhODc4ZDkzYWFkNWM4MzU3ZTJmZjZlZDdk
+     Y2I5ODIxM2RiYmEwNTA4ZmU5ZGQyN2YxZjMzNjc4NzEyYWM3NmM=
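
The `!binary` values above are base64-wrapped hex digests of the two archives packed inside the `.gem` file. A minimal verification sketch, assuming `metadata.gz` and `data.tar.gz` have been extracted from the gem next to `checksums.yaml` (the file layout here is illustrative, not part of the gem):

    # Verify checksums.yaml against the extracted gem archives.
    # Assumes metadata.gz, data.tar.gz and checksums.yaml sit in the
    # current directory -- an illustrative setup, not shipped with the gem.
    require "yaml"
    require "digest"

    sums = YAML.load(File.read("checksums.yaml"))  # Psych decodes the !binary tags to strings
    { "SHA1" => Digest::SHA1, "SHA512" => Digest::SHA512 }.each do |name, klass|
      sums[name].each do |file, expected|
        actual = klass.file(file).hexdigest
        abort "#{name} mismatch for #{file}" unless actual == expected
      end
    end
    puts "checksums OK"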
data/.gitignore ADDED
@@ -0,0 +1,4 @@
+ *.gem
+ Gemfile.lock
+ .bundle
+ vendor
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'http://rubygems.org'
+ gem 'rake'
+ gem 'gem_publisher'
+ gem 'archive-tar-minitar'
data/LICENSE ADDED
@@ -0,0 +1,13 @@
+ Copyright (c) 2012-2014 Elasticsearch <http://www.elasticsearch.org>
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
data/Rakefile ADDED
@@ -0,0 +1,6 @@
+ @files=[]
+
+ task :default do
+   system("rake -T")
+ end
+
data/lib/logstash/inputs/s3.rb ADDED
@@ -0,0 +1,278 @@
+ # encoding: utf-8
+ require "logstash/inputs/base"
+ require "logstash/namespace"
+
+ require "time"
+ require "tmpdir"
+
+ # Stream events from files in an S3 bucket.
+ #
+ # Each line from each file generates an event.
+ # Files ending in '.gz' are handled as gzipped files.
+ class LogStash::Inputs::S3 < LogStash::Inputs::Base
+   config_name "s3"
+   milestone 1
+
+   # TODO(sissel): refactor to use 'line' codec (requires removing both gzip
+   # support and readline usage). Support gzip through a gzip codec! ;)
+   default :codec, "plain"
+
+   # The credentials of the AWS account used to access the bucket.
+   # Credentials can be specified:
+   # - As an ["id","secret"] array
+   # - As a path to a file containing AWS_ACCESS_KEY_ID=... and AWS_SECRET_ACCESS_KEY=...
+   # - In the environment, if not set (using variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY)
+   config :credentials, :validate => :array, :default => []
+
+   # The name of the S3 bucket.
+   config :bucket, :validate => :string, :required => true
+
+   # The AWS region for your bucket.
+   config :region, :validate => ["us-east-1", "us-west-1", "us-west-2",
+                                 "eu-west-1", "ap-southeast-1", "ap-southeast-2",
+                                 "ap-northeast-1", "sa-east-1", "us-gov-west-1"],
+          :deprecated => "'region' has been deprecated in favor of 'region_endpoint'"
+
+   # The AWS region endpoint for your bucket.
+   config :region_endpoint, :validate => ["us-east-1", "us-west-1", "us-west-2",
+                                          "eu-west-1", "ap-southeast-1", "ap-southeast-2",
+                                          "ap-northeast-1", "sa-east-1", "us-gov-west-1"], :default => "us-east-1"
+
+   # If specified, only keys whose names start with this prefix (a literal string, not a regexp) are processed.
+   config :prefix, :validate => :string, :default => nil
+
+   # Where to write the since database (keeps track of the date
+   # the last handled file was added to S3). The default will write
+   # sincedb files to some path matching "$HOME/.sincedb*"
+   config :sincedb_path, :validate => :string, :default => nil
+
+   # Name of an S3 bucket to back up processed files to.
+   config :backup_to_bucket, :validate => :string, :default => nil
+
+   # Path of a local directory to back up processed files to.
+   config :backup_to_dir, :validate => :string, :default => nil
+
+   # Whether to delete processed files from the original bucket.
+   config :delete, :validate => :boolean, :default => false
+
+   # Interval, in seconds, to wait before checking the file list again
+   # after a run is finished.
+   config :interval, :validate => :number, :default => 60
+
+   public
+   def register
+     require "digest/md5"
+     require "aws-sdk"
+
+     @region_endpoint = @region if @region && !@region.empty?
+
+     @logger.info("Registering s3 input", :bucket => @bucket, :region_endpoint => @region_endpoint)
+
+     if @credentials.length == 0
+       @access_key_id = ENV['AWS_ACCESS_KEY_ID']
+       @secret_access_key = ENV['AWS_SECRET_ACCESS_KEY']
+     elsif @credentials.length == 1
+       # A single element is treated as the path to a credentials file.
+       File.open(@credentials[0]) do |f|
+         f.each do |line|
+           unless /^\#/.match(line)
+             if /\s*=\s*/.match(line)
+               param, value = line.split('=', 2)
+               param = param.chomp.strip
+               value = value.chomp.strip
+               if param.eql?('AWS_ACCESS_KEY_ID')
+                 @access_key_id = value
+               elsif param.eql?('AWS_SECRET_ACCESS_KEY')
+                 @secret_access_key = value
+               end
+             end
+           end
+         end
+       end
+     elsif @credentials.length == 2
+       @access_key_id = @credentials[0]
+       @secret_access_key = @credentials[1]
+     else
+       raise ArgumentError.new('Credentials must be of the form "/path/to/file" or ["id", "secret"]')
+     end
+
+     if @access_key_id.nil? or @secret_access_key.nil?
+       raise ArgumentError.new('Missing AWS credentials')
+     end
+
+     if @bucket.nil?
+       raise ArgumentError.new('Missing AWS bucket')
+     end
+
+     if @sincedb_path.nil?
+       if ENV['HOME'].nil?
+         raise ArgumentError.new('No HOME or sincedb_path set')
+       end
+       @sincedb_path = File.join(ENV["HOME"], ".sincedb_" + Digest::MD5.hexdigest("#{@bucket}+#{@prefix}"))
+     end
+
+     s3 = AWS::S3.new(
+       :access_key_id => @access_key_id,
+       :secret_access_key => @secret_access_key,
+       :region => @region_endpoint
+     )
+
+     @s3bucket = s3.buckets[@bucket]
+
+     unless @backup_to_bucket.nil?
+       @backup_bucket = s3.buckets[@backup_to_bucket]
+       unless @backup_bucket.exists?
+         s3.buckets.create(@backup_to_bucket)
+       end
+     end
+
+     unless @backup_to_dir.nil?
+       Dir.mkdir(@backup_to_dir, 0700) unless File.exists?(@backup_to_dir)
+     end
+
+   end # def register
+
+   public
+   def run(queue)
+     loop do
+       process_new(queue)
+       sleep(@interval)
+     end
+     finished
+   end # def run
+
+   private
+   def process_new(queue, since=nil)
+
+     if since.nil?
+       since = sincedb_read()
+     end
+
+     objects = list_new(since)
+     objects.each do |k|
+       @logger.debug("S3 input processing", :bucket => @bucket, :key => k)
+       lastmod = @s3bucket.objects[k].last_modified
+       process_log(queue, k)
+       sincedb_write(lastmod)
+     end
+
+   end # def process_new
+
+   private
+   def list_new(since=nil)
+
+     if since.nil?
+       since = Time.new(0)
+     end
+
+     objects = {}
+     @s3bucket.objects.with_prefix(@prefix).each do |log|
+       if log.last_modified > since
+         objects[log.key] = log.last_modified
+       end
+     end
+
+     # Return keys sorted by last-modified time, oldest first.
+     return objects.keys.sort { |a, b| objects[a] <=> objects[b] }
+
+   end # def list_new
+
+   private
+   def process_log(queue, key)
+
+     object = @s3bucket.objects[key]
+     tmp = Dir.mktmpdir("logstash-")
+     begin
+       filename = File.join(tmp, File.basename(key))
+       File.open(filename, 'wb') do |s3file|
+         object.read do |chunk|
+           s3file.write(chunk)
+         end
+       end
+       process_local_log(queue, filename)
+       unless @backup_to_bucket.nil?
+         backup_object = @backup_bucket.objects[key]
+         backup_object.write(Pathname.new(filename))
+       end
+       unless @backup_to_dir.nil?
+         FileUtils.cp(filename, @backup_to_dir)
+       end
+       if @delete
+         object.delete()
+       end
+     ensure
+       # Clean up the temporary directory even if processing raised.
+       FileUtils.remove_entry_secure(tmp, true)
+     end
+
+   end # def process_log
+
+   private
+   def process_local_log(queue, filename)
+
+     metadata = {
+       :version => nil,
+       :format => nil,
+     }
+     File.open(filename) do |file|
+       if filename.end_with?('.gz')
+         gz = Zlib::GzipReader.new(file)
+         gz.each_line do |line|
+           metadata = process_line(queue, metadata, line)
+         end
+       else
+         file.each do |line|
+           metadata = process_line(queue, metadata, line)
+         end
+       end
+     end
+
+   end # def process_local_log
+
+   private
+   def process_line(queue, metadata, line)
+
+     # CloudFront access logs carry '#Version:' and '#Fields:' header lines;
+     # remember them so they can be attached to subsequent events.
+     if /#Version: .+/.match(line)
+       junk, version = line.strip().split(/#Version: (.+)/)
+       unless version.nil?
+         metadata[:version] = version
+       end
+     elsif /#Fields: .+/.match(line)
+       junk, format = line.strip().split(/#Fields: (.+)/)
+       unless format.nil?
+         metadata[:format] = format
+       end
+     else
+       @codec.decode(line) do |event|
+         decorate(event)
+         unless metadata[:version].nil?
+           event["cloudfront_version"] = metadata[:version]
+         end
+         unless metadata[:format].nil?
+           event["cloudfront_fields"] = metadata[:format]
+         end
+         queue << event
+       end
+     end
+     return metadata
+
+   end # def process_line
+
+   private
+   def sincedb_read()
+
+     if File.exists?(@sincedb_path)
+       since = Time.parse(File.read(@sincedb_path).chomp.strip)
+     else
+       since = Time.new(0)
+     end
+     return since
+
+   end # def sincedb_read
+
+   private
+   def sincedb_write(since=nil)
+
+     if since.nil?
+       since = Time.now()
+     end
+     File.open(@sincedb_path, 'w') { |file| file.write(since.to_s) }
+
+   end # def sincedb_write
+
+ end # class LogStash::Inputs::S3
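
For reference, a minimal, hypothetical harness for driving this input outside a full Logstash pipeline. The bucket name and credentials are placeholders, and plugin construction follows the Logstash 1.4-era `new(params)` convention; this is a sketch, not shipped code:

    # Hypothetical smoke-test harness (not part of the gem).
    # Bucket name and credentials below are placeholders.
    require "thread"
    require "logstash/inputs/s3"

    input = LogStash::Inputs::S3.new(
      "bucket"      => "my-log-bucket",
      "credentials" => ["AKID-PLACEHOLDER", "SECRET-PLACEHOLDER"],
      "prefix"      => "cloudfront/",
      "interval"    => 30
    )
    input.register

    queue = Queue.new
    Thread.new { input.run(queue) }  # run loops forever, polling every `interval` seconds
    event = queue.pop                # blocks until the first line becomes an event
    puts event["message"]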
data/logstash-input-s3.gemspec ADDED
@@ -0,0 +1,29 @@
+ Gem::Specification.new do |s|
+
+   s.name        = 'logstash-input-s3'
+   s.version     = '0.1.0'
+   s.licenses    = ['Apache License (2.0)']
+   s.summary     = "Stream events from files from a S3 bucket."
+   s.description = "Stream events from files from a S3 bucket."
+   s.authors     = ["Elasticsearch"]
+   s.email       = 'richard.pijnenburg@elasticsearch.com'
+   s.homepage    = "http://logstash.net/"
+   s.require_paths = ["lib"]
+
+   # Files
+   s.files = `git ls-files`.split($\) + ::Dir.glob('vendor/*')
+
+   # Tests
+   s.test_files = s.files.grep(%r{^(test|spec|features)/})
+
+   # Special flag to let us know this is actually a logstash plugin
+   s.metadata = { "logstash_plugin" => "true", "group" => "input" }
+
+   # Gem dependencies
+   s.add_runtime_dependency 'logstash', '>= 1.4.0', '< 2.0.0'
+
+   s.add_runtime_dependency 'logstash-codec-plain'
+   s.add_runtime_dependency 'aws-sdk'
+
+ end
+
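
A quick way to sanity-check this gemspec from Ruby rather than the `gem build` CLI, sketched with the RubyGems API (assuming a modern RubyGems and that it runs from the repository root):

    # Load and build the gem programmatically (illustrative sketch).
    require "rubygems/package"

    spec = Gem::Specification.load("logstash-input-s3.gemspec")
    puts spec.full_name       # => "logstash-input-s3-0.1.0"
    Gem::Package.build(spec)  # writes logstash-input-s3-0.1.0.gem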
data/rakelib/publish.rake ADDED
@@ -0,0 +1,9 @@
+ require "gem_publisher"
+
+ desc "Publish gem to RubyGems.org"
+ task :publish_gem do |t|
+   gem_file = Dir.glob(File.expand_path('../*.gemspec', File.dirname(__FILE__))).first
+   gem = GemPublisher.publish_if_updated(gem_file, :rubygems)
+   puts "Published #{gem}" if gem
+ end
+
data/rakelib/vendor.rake ADDED
@@ -0,0 +1,169 @@
+ require "net/http"
+ require "uri"
+ require "digest/sha1"
+
+ def vendor(*args)
+   return File.join("vendor", *args)
+ end
+
+ directory "vendor/" => ["vendor"] do |task, args|
+   mkdir task.name
+ end
+
+ def fetch(url, sha1, output)
+
+   puts "Downloading #{url}"
+   actual_sha1 = download(url, output)
+
+   if actual_sha1 != sha1
+     fail "SHA1 does not match (expected '#{sha1}' but got '#{actual_sha1}')"
+   end
+ end # def fetch
+
+ def file_fetch(url, sha1)
+   filename = File.basename(URI(url).path)
+   output = "vendor/#{filename}"
+   task output => [ "vendor/" ] do
+     begin
+       actual_sha1 = file_sha1(output)
+       if actual_sha1 != sha1
+         fetch(url, sha1, output)
+       end
+     rescue Errno::ENOENT
+       fetch(url, sha1, output)
+     end
+   end.invoke
+
+   return output
+ end
+
+ def file_sha1(path)
+   digest = Digest::SHA1.new
+   fd = File.new(path, "r")
+   while true
+     begin
+       digest << fd.sysread(16384)
+     rescue EOFError
+       break
+     end
+   end
+   return digest.hexdigest
+ ensure
+   fd.close if fd
+ end
+
+ def download(url, output)
+   uri = URI(url)
+   digest = Digest::SHA1.new
+   tmp = "#{output}.tmp"
+   Net::HTTP.start(uri.host, uri.port, :use_ssl => (uri.scheme == "https")) do |http|
+     request = Net::HTTP::Get.new(uri.path)
+     http.request(request) do |response|
+       fail "HTTP fetch failed for #{url}. #{response}" unless ["200", "301"].include?(response.code)
+       size = (response["content-length"] || -1).to_i.to_f
+       count = 0
+       File.open(tmp, "w") do |fd|
+         response.read_body do |chunk|
+           fd.write(chunk)
+           digest << chunk
+           if size > 0 && $stdout.tty?
+             count += chunk.bytesize
+             $stdout.write(sprintf("\r%0.2f%%", count / size * 100))
+           end
+         end
+       end
+       $stdout.write("\r \r") if $stdout.tty?
+     end
+   end
+
+   File.rename(tmp, output)
+
+   return digest.hexdigest
+ rescue SocketError => e
+   puts "Failure while downloading #{url}: #{e}"
+   raise
+ ensure
+   File.unlink(tmp) if File.exist?(tmp)
+ end # def download
+
+ def untar(tarball, &block)
+   require "archive/tar/minitar"
+   tgz = Zlib::GzipReader.new(File.open(tarball))
+   # Pull out typesdb
+   tar = Archive::Tar::Minitar::Input.open(tgz)
+   tar.each do |entry|
+     path = block.call(entry)
+     next if path.nil?
+     parent = File.dirname(path)
+
+     mkdir_p parent unless File.directory?(parent)
+
+     # Skip this file if the output file is the same size
+     if entry.directory?
+       mkdir path unless File.directory?(path)
+     else
+       entry_mode = entry.instance_eval { @mode } & 0777
+       if File.exists?(path)
+         stat = File.stat(path)
+         # TODO(sissel): Submit a patch to archive-tar-minitar upstream to
+         # expose headers in the entry.
+         entry_size = entry.instance_eval { @size }
+         # If file sizes are same, skip writing.
+         next if stat.size == entry_size && (stat.mode & 0777) == entry_mode
+       end
+       puts "Extracting #{entry.full_name} from #{tarball} #{entry_mode.to_s(8)}"
+       File.open(path, "w") do |fd|
+         # eof? check lets us skip empty files. Necessary because the API provided by
+         # Archive::Tar::Minitar::Reader::EntryStream only mostly acts like an
+         # IO object. Something about empty files in this EntryStream causes
+         # IO.copy_stream to throw "can't convert nil into String" on JRuby
+         # TODO(sissel): File a bug about this.
+         while !entry.eof?
+           chunk = entry.read(16384)
+           fd.write(chunk)
+         end
+         #IO.copy_stream(entry, fd)
+       end
+       File.chmod(entry_mode, path)
+     end
+   end
+   tar.close
+   File.unlink(tarball) if File.file?(tarball)
+ end # def untar
+
+ def ungz(file)
+
+   outpath = file.gsub('.gz', '')
+   tgz = Zlib::GzipReader.new(File.open(file))
+   begin
+     File.open(outpath, "w") do |out|
+       IO::copy_stream(tgz, out)
+     end
+     File.unlink(file)
+   rescue
+     File.unlink(outpath) if File.file?(outpath)
+     raise
+   end
+   tgz.close
+ end
+
+ desc "Process any vendor files required for this plugin"
+ task "vendor" do |task, args|
+
+   @files.each do |file|
+     download = file_fetch(file['url'], file['sha1'])
+     if download =~ /\.tar\.gz$/
+       prefix = download.gsub('.tar.gz', '').gsub('vendor/', '')
+       untar(download) do |entry|
+         unless file['files'].nil?
+           next unless file['files'].include?(entry.full_name.gsub(prefix, ''))
+         end
+         # Flatten the entry into vendor/ using its basename.
+         File.join('vendor', entry.full_name.split("/").last)
+       end
+     elsif download =~ /\.gz$/
+       ungz(download)
+     end
+   end
+
+ end
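
The `vendor` task iterates over `@files`, which the top-level Rakefile initializes to an empty array. A sketch of the entry shape the task expects, with a made-up URL, checksum, and file list purely for illustration:

    # Illustrative only: the URL, sha1 and file list below are invented.
    # Each entry drives one file_fetch, then untar or ungz by extension.
    @files = [
      {
        'url'   => 'https://example.com/some-dependency-1.0.tar.gz',
        'sha1'  => '0000000000000000000000000000000000000000',
        # Optional: extract only these paths (relative to the tarball prefix).
        'files' => ['/lib/some-dependency.rb'],
      },
    ]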
data/spec/inputs/s3_spec.rb ADDED
@@ -0,0 +1,5 @@
+ require 'spec_helper'
+ require 'logstash/inputs/s3'
+
+ describe LogStash::Inputs::S3 do
+ end
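
The spec file ships as an empty placeholder. One direction a first test could take, sketched with placeholder credentials and exercising only local behavior (sincedb path derivation) so that nothing touches S3; the accessor and expectations here are illustrative, not part of the gem:

    # Illustrative spec body (not shipped with the gem). Assumes the config
    # mixin exposes options such as sincedb_path as accessors.
    require 'spec_helper'
    require 'logstash/inputs/s3'

    describe LogStash::Inputs::S3 do
      it "derives a sincedb path under HOME from bucket and prefix" do
        input = LogStash::Inputs::S3.new(
          "bucket"      => "my-bucket",
          "credentials" => ["id-placeholder", "secret-placeholder"])
        input.register
        expect(input.sincedb_path).to start_with(File.join(ENV['HOME'], ".sincedb_"))
      end
    end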
metadata ADDED
@@ -0,0 +1,103 @@
+ --- !ruby/object:Gem::Specification
+ name: logstash-input-s3
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Elasticsearch
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-11-05 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: logstash
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.4.0
+     - - <
+       - !ruby/object:Gem::Version
+         version: 2.0.0
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: 1.4.0
+     - - <
+       - !ruby/object:Gem::Version
+         version: 2.0.0
+ - !ruby/object:Gem::Dependency
+   name: logstash-codec-plain
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: aws-sdk
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Stream events from files from a S3 bucket.
+ email: richard.pijnenburg@elasticsearch.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - LICENSE
+ - Rakefile
+ - lib/logstash/inputs/s3.rb
+ - logstash-input-s3.gemspec
+ - rakelib/publish.rake
+ - rakelib/vendor.rake
+ - spec/inputs/s3_spec.rb
+ homepage: http://logstash.net/
+ licenses:
+ - Apache License (2.0)
+ metadata:
+   logstash_plugin: 'true'
+   group: input
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.1
+ signing_key:
+ specification_version: 4
+ summary: Stream events from files from a S3 bucket.
+ test_files:
+ - spec/inputs/s3_spec.rb