tailf2kafka 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 0511b332f01cad011509e5312d35a44406996394
+   data.tar.gz: 1ca828eed00c80a7896b3cf384292e577d8832f5
+ SHA512:
+   metadata.gz: bbe8203735d60e6483a5aef2f0a3f6c0473828753fa5d4649313b56a147b96e63fa300bec9f6a5900995c6c56578b69e4db6af88d572c1e946fbb8a687508f56
+   data.tar.gz: 1e9c1d14ad055d267a005208a5a2a59bd9a5786984bda8b323aff08b6703a75c596761ce688c9ec1552d8fd3a15dc6e5b78e1090ab7e9dd740890b937ab85997
data/LICENSE ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2015 Supersonic LTD
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,51 @@
+ # Tailf2Kafka
+
+ Watch directories for files whose names match configured time-based patterns, tail them, and push their lines to Kafka.
+
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'tailf2kafka'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install tailf2kafka
+
+ ## Usage
+
+     $ tailf2kafka -h
+     Usage: tailf2kafka [options]
+             --config PATH            Path to settings config
+         -h, --help                   Display this screen
+     $
+
+ ## Config
+
+     tailf:
+       files:
+         - topic: haproxy
+           prefix: /var/log/haproxy/haproxy
+           time_pattern: ".%Y-%m-%d.%H"
+       position_file: "/var/lib/haproxy/tail2kafka.offsets"
+       flush_interval: 1
+       max_batch_lines: 1024
+       from_begining: true
+       delete_old_tailed_files: true
+     kafka:
+       brokers: ["broker1:9092", "broker2:9092", "broker3:9092"]
+       producer_type: sync
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
+ 6. Go to 1
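The `time_pattern` in the config above is a strftime-style pattern that the daemon compiles into a regexp to decide which files under `prefix` get picked up. A minimal, self-contained sketch of that matching, mirroring the `time_pattern_to_regexp` helper in `bin/tailf2kafka` below (file names here are illustrative):

```ruby
# Expand strftime-style tokens into regexp fragments, as the daemon does.
TIME_REGEXPS = { 'Y' => '[0-9]{4}', 'm' => '[0-9]{2}', 'd' => '[0-9]{2}', 'H' => '[0-9]{2}' }

def time_pattern_to_regexp(pattern)
  pattern.gsub(/%([^%])/) { TIME_REGEXPS.fetch($1, $1) }
end

regexp = Regexp.new(time_pattern_to_regexp('.%Y-%m-%d.%H'))
puts(('haproxy.2015-09-23.14' =~ regexp) ? 'tailed' : 'ignored')  # => tailed
puts(('haproxy.log' =~ regexp) ? 'tailed' : 'ignored')            # => ignored
```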
data/bin/tailf2kafka ADDED
@@ -0,0 +1,244 @@
+ #!/usr/bin/env ruby
+
+ require 'optparse'
+ require 'poseidon'
+ require 'yaml'
+ require 'hash_symbolizer'
+ require 'schash'
+ require 'rb-inotify'
+ require 'timers'
+ require 'socket'
+ require 'fileutils'
+ require 'pp' # needed for the pp call in the settings validation error path below
+
+ $stdout.sync = true
+
+ Thread.abort_on_exception = true
+
+ @config = nil
+
+ opts = OptionParser.new
+ opts.banner = "Usage: #{$0} [options]"
+ opts.on( '--config PATH', String, 'Path to settings config' ) { |c| @config = c }
+ opts.on( '-h', '--help', 'Display this screen' ) { puts opts; exit 0 }
+ opts.parse!
+
+ unless @config
+   puts opts
+   exit 1
+ end
+
+ @settings = YAML.load_file(@config).symbolize_keys(true)
+
+ validator = Schash::Validator.new do
+   {
+     tailf: {
+       files: array_of({
+         topic: string,
+         prefix: string,
+         suffix: optional(string),
+         time_pattern: string,
+       }),
+       position_file: string,
+       flush_interval: integer,
+       max_batch_lines: integer,
+       from_begining: boolean,
+       delete_old_tailed_files: optional(boolean),
+     },
+     kafka: {
+       brokers: array_of(string),
+       producer_type: match(/^(sync|async)$/),
+       produce: optional(boolean),
+     },
+   }
+ end
+
+ unless validator.validate(@settings).empty?
+   puts "ERROR: bad settings"
+   pp validator.validate(@settings)
+   exit 1
+ end
+
+ @settings[:tailf][:files] = @settings[:tailf][:files].map{|h| h.symbolize_keys(true)}
+
+ @mutex = Mutex.new
+
+ @create_notifier = INotify::Notifier.new
+ @delete_notifier = INotify::Notifier.new
+ @tailf_notifier = INotify::Notifier.new
+
+ @dirs = {}
+ @files = {}
+ @threads = {}
+ @position_file = @settings[:tailf][:position_file]
+ @flush_interval = @settings[:tailf][:flush_interval]
+ @max_batch_lines = @settings[:tailf][:max_batch_lines]
+ @from_begining = @settings[:tailf][:from_begining]
+ @delete_old_tailed_files = @settings[:tailf].has_key?(:delete_old_tailed_files) ? @settings[:tailf][:delete_old_tailed_files] : false
+ @brokers = @settings[:kafka][:brokers]
+ @producer_type = @settings[:kafka][:producer_type].to_sym
+ @produce = @settings[:kafka].has_key?(:produce) ? @settings[:kafka][:produce] : true
+
+ def write_position_file
+   @mutex.synchronize do
+     File.open(@position_file, 'w') do |file|
+       @files.each do |path, attrs|
+         file.puts "#{path} #{attrs[:pattern]} #{attrs[:topic]} #{attrs[:inode]} #{attrs[:offset]}"
+       end
+     end
+   end
+ end
+
+ def load_position_file
+   if File.exist?(@position_file)
+     IO.readlines(@position_file).each do |line|
+       path, pattern, topic, inode, offset = line.split(' ')
+       # Load state only for files that still exist with the same inode and were not truncated/rewound.
+       if File.exists?(path) and File.stat(path).ino == inode.to_i and File.stat(path).size >= offset.to_i
+         @files[path] = { :pattern => pattern, :topic => topic, :inode => inode.to_i, :offset => offset.to_i }
+       end
+     end
+   end
+   write_position_file
+ end
+
+ load_position_file
+
+ @topics = @settings[:tailf][:files].map{|tailf_file| tailf_file[:topic]}
+ @producer = Poseidon::Producer.new(@brokers, "#{Socket.gethostname}", :type => @producer_type, :compression_codec => :snappy, :compressed_topics => @topics) if @produce
+
+ @producer_queue = SizedQueue.new(10)
+
+ @producer_thread = Thread.new do
+   loop do
+     batch = @producer_queue.pop
+     begin
+       @producer.send_messages(batch[:messages]) if @produce
+     rescue Poseidon::Errors::UnableToFetchMetadata
+       puts "Got Poseidon::Errors::UnableToFetchMetadata while trying to produce kafka messages, retrying in 1 second ..."
+       sleep 1
+       retry
+     end
+     @files[batch[:path]][:offset] = batch[:offset]
+   end
+ end
+
+ def kafka_produce(path, buffer, offset)
+   messages = []
+   buffer.each do |msg|
+     messages << Poseidon::MessageToSend.new(@files[path][:topic], msg)
+   end
+   @producer_queue.push({ :path => path, :messages => messages, :offset => offset })
+ end
+
+ def tailf(path)
+   file = File.open(path, 'r')
+   @files[path][:fd] = file
+   file.seek(@files[path][:offset], IO::SEEK_SET)
+   loop do # Fast-read the file in batches until we reach EOF, upon which we start the tailf modify watcher
+     batch = file.each_line.take(@max_batch_lines)
+     break if batch.empty?
+     kafka_produce(path, batch, file.pos)
+   end
+   @tailf_notifier.watch(path, :modify) do |event|
+     unless file.closed?
+       batch = file.each_line.take(@max_batch_lines)
+       kafka_produce(path, batch, file.pos) unless batch.empty?
+     else
+       puts "watcher got modify event on closed file #{event.name}"
+     end
+   end
+ end
+
+ @time_regexp_hash = {
+   'Y' => '[0-9]{4}',
+   'm' => '[0-9]{2}',
+   'd' => '[0-9]{2}',
+   'H' => '[0-9]{2}'
+ }
+
+ def time_pattern_to_regexp(pattern)
+   pattern.gsub(/%([^%])/) do
+     match = $1
+     @time_regexp_hash.has_key?(match) ? @time_regexp_hash[match] : match
+   end
+ end
+
+ # Scan existing files that match watched prefixes and start tailing them
+ @settings[:tailf][:files].each do |tailf_file|
+   dir = File.dirname(tailf_file[:prefix])
+   if File.exists?(dir) and File.directory?(dir)
+     @dirs[dir] ||= []
+     # Keep the topic alongside the prefix/pattern so the create watcher below can tag new files
+     @dirs[dir] << { :prefix => File.basename(tailf_file[:prefix]), :pattern => tailf_file[:time_pattern], :suffix => "#{tailf_file[:suffix]}", :topic => tailf_file[:topic] }
+     Dir.glob("#{tailf_file[:prefix]}*#{tailf_file[:suffix]}").each do |path|
+       if path.match(Regexp.new(time_pattern_to_regexp(tailf_file[:time_pattern])))
+         unless File.directory?(path)
+           # Populate state only if it was not loaded from the position file
+           unless @files.has_key?(path)
+             @files[path] = { :pattern => tailf_file[:time_pattern], :topic => tailf_file[:topic], :inode => File.stat(path).ino, :offset => 0 }
+             @files[path][:offset] = File.stat(path).size unless @from_begining
+           end
+           @threads[path] = Thread.new { tailf(path) } unless @threads.has_key?(path)
+         end
+       end
+     end
+   end
+ end
+
+ def delete_old_tailed_files
+   @mutex.synchronize do
+     @files.each_key do |path|
+       unless path.match(Regexp.new(Time.now.strftime(@files[path][:pattern])))
+         if File.exists?(path) and File.stat(path).ino == @files[path][:inode] and File.stat(path).size == @files[path][:offset]
+           puts "Deleting fully produced file with old time pattern #{path}"
+           FileUtils.rm_r(path)
+         end
+       end
+     end
+   end
+ end
+
+ @timers = Timers::Group.new
+ @uploads_timer = @timers.every(@flush_interval) { write_position_file }
+ @delete_old_tailed_files_timer = @timers.every(60) { delete_old_tailed_files } if @delete_old_tailed_files
+ Thread.new { loop { @timers.wait } }
+
+ @dirs.each_key do |dir|
+
+   @create_notifier.watch(dir, :create, :moved_to) do |event|
+     @mutex.synchronize do
+       path = "#{dir}/#{event.name}"
+       matches = @dirs[dir].select{|h| event.name.match(Regexp.new(h[:prefix] + time_pattern_to_regexp(h[:pattern]) + h[:suffix]))}
+       unless matches.empty?
+         unless File.directory?(path)
+           unless @threads.has_key?(path)
+             puts "File #{event.name} was created in / moved into watched dir #{dir}"
+             @files[path] = { :pattern => matches.first[:pattern], :topic => matches.first[:topic], :inode => File.stat(path).ino, :offset => 0 }
+             @threads[path] = Thread.new { tailf(path) }
+           end
+         end
+       end
+     end
+   end
+
+   @delete_notifier.watch(dir, :delete, :moved_from) do |event|
+     @mutex.synchronize do
+       path = "#{dir}/#{event.name}"
+       if @threads.has_key?(path)
+         puts "File #{event.name} was deleted / moved from watched dir #{dir}"
+         if @threads[path].alive?
+           @threads[path].terminate
+           @threads[path].join
+         end
+         @threads.delete(path)
+         @files[path][:fd].close unless @files[path][:fd].closed?
+         @files.delete(path)
+       end
+     end
+   end
+
+ end
+
+ Thread.new { @create_notifier.run }
+ Thread.new { @delete_notifier.run }
+
+ @tailf_notifier.run
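A note on the design above: the tailer threads never talk to Kafka directly. They push line batches onto a bounded `SizedQueue`, and the single producer thread advances each file's offset only after `send_messages` succeeds, so the position file never records lines that were not handed to Kafka. A stripped-down sketch of that hand-off (names and the shutdown sentinel are illustrative, not part of the gem):

```ruby
# Bounded queue decouples tailing from Kafka sends; offsets advance only after delivery.
queue   = SizedQueue.new(10)  # back-pressure: tailers block when the producer falls behind
offsets = Hash.new(0)

producer = Thread.new do
  while (batch = queue.pop)   # nil sentinel ends the loop (illustrative shutdown)
    # deliver batch[:messages] to Kafka here; on failure, sleep and retry as the script does
    offsets[batch[:path]] = batch[:offset]  # commit the reader's offset only after delivery
  end
end

queue.push({ :path => '/var/log/app.log', :messages => ['line 1', 'line 2'], :offset => 42 })
queue.push(nil)
producer.join
puts offsets.inspect  # => {"/var/log/app.log"=>42}
```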
data/lib/tailf2kafka/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Tailf2Kafka
+   VERSION ||= '0.1.0'
+ end
data/lib/tailf2kafka.rb ADDED
@@ -0,0 +1,3 @@
+ module Tailf2Kafka
+   require 'tailf2kafka/version.rb'
+ end
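For reference, the `position_file` maintained by `write_position_file` in the script above is plain text, one `path pattern topic inode offset` record per tailed file. A small sketch of parsing one record, mirroring `load_position_file` (the sample line is made up):

```ruby
# Parse one tailf2kafka offsets record: path pattern topic inode offset
line = "/var/log/haproxy/haproxy.2015-09-23.14 .%Y-%m-%d.%H haproxy 393219 81236"

path, pattern, topic, inode, offset = line.split(' ')
state = { :pattern => pattern, :topic => topic, :inode => inode.to_i, :offset => offset.to_i }
puts "resume #{path} for topic #{state[:topic]} at byte #{state[:offset]}"
```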
data/tailf2kafka.gemspec ADDED
@@ -0,0 +1,30 @@
+ lib = File.expand_path('../lib/', __FILE__)
+ $:.unshift lib unless $:.include?(lib)
+
+ require "tailf2kafka/version"
+
+ Gem::Specification.new do |s|
+   s.name = "tailf2kafka"
+   s.version = Tailf2Kafka::VERSION
+   s.platform = Gem::Platform::RUBY
+   s.authors = ["Alexander Piavlo"]
+   s.email = ["devops@supersonic.com"]
+   s.homepage = "http://github.com/SupersonicAds/tailf2kafka"
+   s.summary = "Watch and tail files with specified time based patterns and push them to kafka"
+   s.description = "Watch and tail files with specified time based patterns and push them to kafka"
+   s.license = 'MIT'
+   s.has_rdoc = false
+
+   s.add_dependency('poseidon')
+   s.add_dependency('hash_symbolizer')
+   s.add_dependency('schash')
+   s.add_dependency('rb-inotify')
+   s.add_dependency('timers')
+
+   s.add_development_dependency('rake')
+
+   s.files = Dir.glob("{bin,lib}/**/*") + %w(tailf2kafka.gemspec LICENSE README.md)
+   s.executables = Dir.glob('bin/**/*').map { |file| File.basename(file) }
+   s.test_files = nil
+   s.require_paths = ['lib']
+ end
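To inspect this spec programmatically, the standard RubyGems API can load it from a checkout of the repo (this assumes `lib/tailf2kafka/version.rb` is present, since the gemspec requires it):

```ruby
# Load and query the gemspec with the stock RubyGems API.
require 'rubygems'

spec = Gem::Specification.load('tailf2kafka.gemspec')
puts spec.name     # => tailf2kafka
puts spec.version  # => 0.1.0
puts spec.runtime_dependencies.map(&:name).join(', ')
# => poseidon, hash_symbolizer, schash, rb-inotify, timers
```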
metadata ADDED
@@ -0,0 +1,137 @@
+ --- !ruby/object:Gem::Specification
+ name: tailf2kafka
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Alexander Piavlo
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-09-23 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: poseidon
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: hash_symbolizer
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: schash
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rb-inotify
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: timers
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Watch and tail files with specified time based patterns and push them
+   to kafka
+ email:
+ - devops@supersonic.com
+ executables:
+ - tailf2kafka
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - LICENSE
+ - README.md
+ - bin/tailf2kafka
+ - lib/tailf2kafka.rb
+ - lib/tailf2kafka/version.rb
+ - tailf2kafka.gemspec
+ homepage: http://github.com/SupersonicAds/tailf2kafka
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Watch and tail files with specified time based patterns and push them to
+   kafka
+ test_files: []