fluent-plugin-webhdfs 0.0.1

data/.gitignore ADDED
@@ -0,0 +1,17 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in fluent-plugin-webhdfs.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,13 @@
+ Copyright (c) 2012- TAGOMORI Satoshi
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
data/README.md ADDED
@@ -0,0 +1,29 @@
+ # Fluent::Plugin::Webhdfs
+
+ TODO: Write a gem description
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'fluent-plugin-webhdfs'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install fluent-plugin-webhdfs
+
+ ## Usage
+
+ TODO: Write usage instructions here
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ #!/usr/bin/env rake
+ require "bundler/gem_tasks"
data/fluent-plugin-webhdfs.gemspec ADDED
@@ -0,0 +1,21 @@
+ # -*- encoding: utf-8 -*-
+ Gem::Specification.new do |gem|
+   gem.name = "fluent-plugin-webhdfs"
+   gem.version = "0.0.1"
+   gem.authors = ["TAGOMORI Satoshi"]
+   gem.email = ["tagomoris@gmail.com"]
+   gem.summary = %q{Fluentd plugin to write data on HDFS over WebHDFS, with flexible formatting}
+   gem.description = %q{Fluentd plugin to write data on HDFS over WebHDFS}
+   gem.homepage = "https://github.com/tagomoris/fluent-plugin-webhdfs"
+
+   gem.files = `git ls-files`.split($\)
+   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+   gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
+   gem.require_paths = ["lib"]
+
+   gem.add_development_dependency "rake"
+   gem.add_development_dependency "fluentd"
+   gem.add_development_dependency "webhdfs"
+   gem.add_runtime_dependency "fluentd"
+   gem.add_runtime_dependency "webhdfs"
+ end
data/lib/fluent/plugin/ext_mixin.rb ADDED
@@ -0,0 +1,207 @@
+ module FluentExt; end
+
+ module FluentExt::PlainTextFormatterMixin
+   #TODO: tests!
+
+   # config_param :output_data_type, :string, :default => 'json' # or 'attr:field' or 'attr:field1,field2,field3(...)'
+
+   attr_accessor :output_include_time, :output_include_tag, :output_data_type
+   attr_accessor :add_newline, :field_separator
+   attr_accessor :remove_prefix, :default_tag
+
+   attr_accessor :f_separator
+
+   def configure(conf)
+     super
+
+     @output_include_time = Fluent::Config.bool_value(conf['output_include_time'])
+     @output_include_time = true if @output_include_time.nil?
+
+     @output_include_tag = Fluent::Config.bool_value(conf['output_include_tag'])
+     @output_include_tag = true if @output_include_tag.nil?
+
+     @output_data_type = conf['output_data_type']
+     @output_data_type = 'json' if @output_data_type.nil?
+
+     @f_separator = case conf['field_separator']
+                    when 'SPACE' then ' '
+                    when 'COMMA' then ','
+                    else "\t"
+                    end
+     @add_newline = Fluent::Config.bool_value(conf['add_newline'])
+     if @add_newline.nil?
+       @add_newline = true
+     end
+
+     @remove_prefix = conf['remove_prefix']
+     if @remove_prefix
+       @removed_prefix_string = @remove_prefix + '.'
+       @removed_length = @removed_prefix_string.length
+     end
+     if @output_include_tag and @remove_prefix and @remove_prefix.length > 0
+       @default_tag = conf['default_tag']
+       if @default_tag.nil? or @default_tag.length < 1
+         raise Fluent::ConfigError, "Missing 'default_tag' with output_include_tag and remove_prefix."
+       end
+     end
+
+     # default timezone: utc
+     if conf['localtime'].nil? and conf['utc'].nil?
+       @utc = true
+       @localtime = false
+     elsif not @localtime and not @utc
+       @utc = true
+       @localtime = false
+     end
+     # mix-in default time formatter (or you can overwrite @timef on your own configure)
+     @timef = @output_include_time ? Fluent::TimeFormatter.new(@time_format, @localtime) : nil
+
+     @custom_attributes = []
+     if @output_data_type == 'json'
+       self.instance_eval {
+         def stringify_record(record)
+           record.to_json
+         end
+       }
+     elsif @output_data_type =~ /^attr:(.*)$/
+       @custom_attributes = $1.split(',')
+       if @custom_attributes.size > 1
+         self.instance_eval {
+           def stringify_record(record)
+             @custom_attributes.map{|attr| (record[attr] || 'NULL').to_s}.join(@f_separator)
+           end
+         }
+       elsif @custom_attributes.size == 1
+         self.instance_eval {
+           def stringify_record(record)
+             (record[@custom_attributes[0]] || 'NULL').to_s
+           end
+         }
+       else
+         raise Fluent::ConfigError, "Invalid attributes specification: '#{@output_data_type}', needs one or more attributes."
+       end
+     else
+       raise Fluent::ConfigError, "Invalid output_data_type: '#{@output_data_type}'. specify 'json' or 'attr:ATTRIBUTE_NAME' or 'attr:ATTR1,ATTR2,...'"
+     end
+
+     if @output_include_time and @output_include_tag
+       if @add_newline and @remove_prefix
+         self.instance_eval {
+           def format(tag,time,record)
+             if (tag[0, @removed_length] == @removed_prefix_string and tag.length > @removed_length) or
+                 tag == @remove_prefix
+               tag = tag[@removed_length..-1] || @default_tag
+             end
+             @timef.format(time) + @f_separator + tag + @f_separator + stringify_record(record) + "\n"
+           end
+         }
+       elsif @add_newline
+         self.instance_eval {
+           def format(tag,time,record)
+             @timef.format(time) + @f_separator + tag + @f_separator + stringify_record(record) + "\n"
+           end
+         }
+       elsif @remove_prefix
+         self.instance_eval {
+           def format(tag,time,record)
+             if (tag[0, @removed_length] == @removed_prefix_string and tag.length > @removed_length) or
+                 tag == @remove_prefix
+               tag = tag[@removed_length..-1] || @default_tag
+             end
+             @timef.format(time) + @f_separator + tag + @f_separator + stringify_record(record)
+           end
+         }
+       else
+         self.instance_eval {
+           def format(tag,time,record)
+             @timef.format(time) + @f_separator + tag + @f_separator + stringify_record(record)
+           end
+         }
+       end
+     elsif @output_include_time
+       if @add_newline
+         self.instance_eval {
+           def format(tag,time,record);
+             @timef.format(time) + @f_separator + stringify_record(record) + "\n"
+           end
+         }
+       else
+         self.instance_eval {
+           def format(tag,time,record);
+             @timef.format(time) + @f_separator + stringify_record(record)
+           end
+         }
+       end
+     elsif @output_include_tag
+       if @add_newline and @remove_prefix
+         self.instance_eval {
+           def format(tag,time,record)
+             if (tag[0, @removed_length] == @removed_prefix_string and tag.length > @removed_length) or
+                 tag == @remove_prefix
+               tag = tag[@removed_length..-1] || @default_tag
+             end
+             tag + @f_separator + stringify_record(record) + "\n"
+           end
+         }
+       elsif @add_newline
+         self.instance_eval {
+           def format(tag,time,record)
+             tag + @f_separator + stringify_record(record) + "\n"
+           end
+         }
+       elsif @remove_prefix
+         self.instance_eval {
+           def format(tag,time,record)
+             if (tag[0, @removed_length] == @removed_prefix_string and tag.length > @removed_length) or
+                 tag == @remove_prefix
+               tag = tag[@removed_length..-1] || @default_tag
+             end
+             tag + @f_separator + stringify_record(record)
+           end
+         }
+       else
+         self.instance_eval {
+           def format(tag,time,record)
+             tag + @f_separator + stringify_record(record)
+           end
+         }
+       end
+     else # without time, tag
+       if @add_newline
+         self.instance_eval {
+           def format(tag,time,record);
+             stringify_record(record) + "\n"
+           end
+         }
+       else
+         self.instance_eval {
+           def format(tag,time,record);
+             stringify_record(record)
+           end
+         }
+       end
+     end
+   end
+
+   def stringify_record(record)
+     record.to_json
+   end
+
+   def format(tag, time, record)
+     if tag == @remove_prefix or (tag[0, @removed_length] == @removed_prefix_string and tag.length > @removed_length)
+       tag = tag[@removed_length..-1] || @default_tag
+     end
+     time_str = if @output_include_time
+                  @timef.format(time) + @f_separator
+                else
+                  ''
+                end
+     tag_str = if @output_include_tag
+                 tag + @f_separator
+               else
+                 ''
+               end
+     time_str + tag_str + stringify_record(record) + "\n"
+   end
+
+ end
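The mixin above picks a `stringify_record` implementation at configure time: `json` serializes the whole record, while `attr:FIELD1,FIELD2,...` joins the named record fields with the configured separator, emitting `NULL` for missing keys. A minimal standalone sketch of the `attr:` behavior (not part of the gem; the field names and record are made up for illustration):

    # mirrors the stringify_record defined for output_data_type 'attr:status,bytes'
    custom_attributes = ['status', 'bytes']
    f_separator = "\t"  # default field separator (TAB)
    record = { 'status' => 200, 'path' => '/index.html' }
    line = custom_attributes.map{|attr| (record[attr] || 'NULL').to_s}.join(f_separator)
    puts line  # => "200\tNULL"  ('bytes' is absent, so NULL is substituted)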
data/lib/fluent/plugin/out_webhdfs.rb ADDED
@@ -0,0 +1,120 @@
+ # -*- coding: utf-8 -*-
+
+ require_relative 'ext_mixin'
+
+ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
+   Fluent::Plugin.register_output('webhdfs', self)
+
+   WEBHDFS_VERSION = 'v1'
+
+   config_set_default :buffer_type, 'memory'
+   config_set_default :time_slice_format, '%Y%m%d'
+
+   config_param :namenode, :string # host:port
+   config_param :path, :string
+   config_param :username, :string, :default => nil
+
+   include FluentExt::PlainTextFormatterMixin
+   config_set_default :output_include_time, true
+   config_set_default :output_include_tag, true
+   config_set_default :output_data_type, 'json'
+   config_set_default :field_separator, "\t"
+   config_set_default :add_newline, true
+   config_set_default :remove_prefix, nil
+
+   def initialize
+     super
+     require 'net/http'
+     require 'time'
+     require 'webhdfs'
+   end
+
+   def configure(conf)
+     if conf['path']
+       if conf['path'].index('%S')
+         conf['time_slice_format'] = '%Y%m%d%H%M%S'
+       elsif conf['path'].index('%M')
+         conf['time_slice_format'] = '%Y%m%d%H%M'
+       elsif conf['path'].index('%H')
+         conf['time_slice_format'] = '%Y%m%d%H'
+       end
+     end
+
+     super
+
+     unless /\A([a-zA-Z0-9][-a-zA-Z0-9.]*):(\d+)\Z/ =~ @namenode
+       raise Fluent::ConfigError, "Invalid config value for namenode: '#{@namenode}', needs NAMENODE_NAME:PORT"
+     end
+     @namenode_host = $1
+     @namenode_port = $2.to_i
+     unless @path.index('/') == 0
+       raise Fluent::ConfigError, "Path on hdfs MUST start with '/', but '#{@path}'"
+     end
+     @conn = nil
+
+     @f_separator = case @field_separator
+                    when 'SPACE' then ' '
+                    when 'COMMA' then ','
+                    else "\t"
+                    end
+
+     # path => cached_url
+     # @cached_datanode_urls = {}
+     @client = WebHDFS::Client.new(@namenode_host, @namenode_port, @username)
+     @mutex = Mutex.new
+   end
+
+   def start
+     super
+
+     noerror = false
+     begin
+       ary = @client.list('/')
+       noerror = true
+     rescue
+       $log.error "webhdfs check request failed!"
+       raise
+     end
+     $log.info "webhdfs connection confirmed: #{@namenode_host}:#{@namenode_port}"
+   end
+
+   def shutdown
+     super
+   end
+
+   def record_to_string(record)
+     record.to_json
+   end
+
+   def format(tag, time, record)
+     time_str = @timef.format(time)
+     time_str + @f_separator + tag + @f_separator + record_to_string(record) + @line_end
+   end
+
+   def path_format(chunk_key)
+     Time.strptime(chunk_key, @time_slice_format).strftime(@path)
+   end
+
+   # TODO datanode url caching?
+
+   # TODO check conflicts
+
+   def send_data(path, data)
+     begin
+       @client.append(path, data)
+     rescue WebHDFS::FileNotFoundError
+       @client.create(path, data)
+     end
+   end
+
+   def write(chunk)
+     hdfs_path = path_format(chunk.key)
+     begin
+       send_data(hdfs_path, chunk.read)
+     rescue
+       $log.error "failed to communicate with hdfs cluster, path: #{hdfs_path}"
+       raise
+     end
+     hdfs_path
+   end
+ end
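For reference, a minimal example of how this output might be configured in Fluentd (hostname, port, path, and field names are hypothetical; the options are taken from the config_param/config_set_default declarations above). Note that a `%H`, `%M`, or `%S` in `path` makes `configure` switch `time_slice_format` to an hourly, per-minute, or per-second slice key, and that `field_separator` recognizes `SPACE` and `COMMA`, with anything else falling back to TAB:

    <match access.**>
      type webhdfs
      namenode namenode.example.com:50070
      path /log/access/%Y%m%d/access.%H.log
      username hdfsuser
      output_include_time true
      output_include_tag true
      output_data_type attr:host,path,status
      field_separator COMMA
    </match>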
metadata ADDED
@@ -0,0 +1,133 @@
+ --- !ruby/object:Gem::Specification
+ name: fluent-plugin-webhdfs
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+   prerelease:
+ platform: ruby
+ authors:
+ - TAGOMORI Satoshi
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-05-20 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: fluentd
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: webhdfs
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: fluentd
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: webhdfs
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Fluentd plugin to write data on HDFS over WebHDFS
+ email:
+ - tagomoris@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - fluent-plugin-webhdfs.gemspec
+ - lib/fluent/plugin/ext_mixin.rb
+ - lib/fluent/plugin/out_webhdfs.rb
+ homepage: https://github.com/tagomoris/fluent-plugin-webhdfs
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.21
+ signing_key:
+ specification_version: 3
+ summary: Fluentd plugin to write data on HDFS over WebHDFS, with flexible formatting
+ test_files: []