fluent-plugin-webhdfs 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,17 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in fluent-plugin-webhdfs.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,13 @@
+ Copyright (c) 2012- TAGOMORI Satoshi
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
data/README.md ADDED
@@ -0,0 +1,29 @@
+ # Fluent::Plugin::Webhdfs
+
+ TODO: Write a gem description
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'fluent-plugin-webhdfs'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install fluent-plugin-webhdfs
+
+ ## Usage
+
+ TODO: Write usage instructions here
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create new Pull Request
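The Usage section above is still a TODO. Judging from the config_param and config_set_default declarations in lib/fluent/plugin/out_webhdfs.rb further down in this diff, a minimal match section would look roughly like the sketch below; the namenode host, port, and HDFS path are placeholders, not values shipped with the gem:

    <match access.**>
      type webhdfs
      namenode namenode.example.com:50070
      path /log/access/%Y%m%d/access.%H.log
    </match>

By default each line written to HDFS is the event time, the tag, and the JSON-serialized record separated by TABs; output_include_time, output_include_tag, output_data_type, and field_separator change that. When path contains %H, %M, or %S, the plugin switches time_slice_format to an hourly, per-minute, or per-second slice automatically (see configure in out_webhdfs.rb).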
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ #!/usr/bin/env rake
+ require "bundler/gem_tasks"
data/fluent-plugin-webhdfs.gemspec ADDED
@@ -0,0 +1,21 @@
+ # -*- encoding: utf-8 -*-
+ Gem::Specification.new do |gem|
+   gem.name = "fluent-plugin-webhdfs"
+   gem.version = "0.0.1"
+   gem.authors = ["TAGOMORI Satoshi"]
+   gem.email = ["tagomoris@gmail.com"]
+   gem.summary = %q{Fluentd plugin to write data on HDFS over WebHDFS, with flexible formatting}
+   gem.description = %q{Fluentd plugin to write data on HDFS over WebHDFS}
+   gem.homepage = "https://github.com/tagomoris/fluent-plugin-webhdfs"
+
+   gem.files = `git ls-files`.split($\)
+   gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+   gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
+   gem.require_paths = ["lib"]
+
+   gem.add_development_dependency "rake"
+   gem.add_development_dependency "fluentd"
+   gem.add_development_dependency "webhdfs"
+   gem.add_runtime_dependency "fluentd"
+   gem.add_runtime_dependency "webhdfs"
+ end
data/lib/fluent/plugin/ext_mixin.rb ADDED
@@ -0,0 +1,207 @@
+ module FluentExt; end
+
+ module FluentExt::PlainTextFormatterMixin
+   #TODO: tests!
+
+   # config_param :output_data_type, :string, :default => 'json' # or 'attr:field' or 'attr:field1,field2,field3(...)'
+
+   attr_accessor :output_include_time, :output_include_tag, :output_data_type
+   attr_accessor :add_newline, :field_separator
+   attr_accessor :remove_prefix, :default_tag
+
+   attr_accessor :f_separator
+
+   def configure(conf)
+     super
+
+     @output_include_time = Fluent::Config.bool_value(conf['output_include_time'])
+     @output_include_time = true if @output_include_time.nil?
+
+     @output_include_tag = Fluent::Config.bool_value(conf['output_include_tag'])
+     @output_include_tag = true if @output_include_tag.nil?
+
+     @output_data_type = conf['output_data_type']
+     @output_data_type = 'json' if @output_data_type.nil?
+
+     @f_separator = case conf['field_separator']
+                    when 'SPACE' then ' '
+                    when 'COMMA' then ','
+                    else "\t"
+                    end
+     @add_newline = Fluent::Config.bool_value(conf['add_newline'])
+     if @add_newline.nil?
+       @add_newline = true
+     end
+
+     @remove_prefix = conf['remove_prefix']
+     if @remove_prefix
+       @removed_prefix_string = @remove_prefix + '.'
+       @removed_length = @removed_prefix_string.length
+     end
+     if @output_include_tag and @remove_prefix and @remove_prefix.length > 0
+       @default_tag = conf['default_tag']
+       if @default_tag.nil? or @default_tag.length < 1
+         raise Fluent::ConfigError, "Missing 'default_tag' with output_include_tag and remove_prefix."
+       end
+     end
+
+     # default timezone: utc
+     if conf['localtime'].nil? and conf['utc'].nil?
+       @utc = true
+       @localtime = false
+     elsif not @localtime and not @utc
+       @utc = true
+       @localtime = false
+     end
+     # mix-in default time formatter (or you can overwrite @timef on your own configure)
+     @timef = @output_include_time ? Fluent::TimeFormatter.new(@time_format, @localtime) : nil
+
+     @custom_attributes = []
+     if @output_data_type == 'json'
+       self.instance_eval {
+         def stringify_record(record)
+           record.to_json
+         end
+       }
+     elsif @output_data_type =~ /^attr:(.*)$/
+       @custom_attributes = $1.split(',')
+       if @custom_attributes.size > 1
+         self.instance_eval {
+           def stringify_record(record)
+             @custom_attributes.map{|attr| (record[attr] || 'NULL').to_s}.join(@f_separator)
+           end
+         }
+       elsif @custom_attributes.size == 1
+         self.instance_eval {
+           def stringify_record(record)
+             (record[@custom_attributes[0]] || 'NULL').to_s
+           end
+         }
+       else
+         raise Fluent::ConfigError, "Invalid attributes specification: '#{@output_data_type}', needs one or more attributes."
+       end
+     else
+       raise Fluent::ConfigError, "Invalid output_data_type: '#{@output_data_type}'. specify 'json' or 'attr:ATTRIBUTE_NAME' or 'attr:ATTR1,ATTR2,...'"
+     end
+
+     if @output_include_time and @output_include_tag
+       if @add_newline and @remove_prefix
+         self.instance_eval {
+           def format(tag,time,record)
+             if (tag[0, @removed_length] == @removed_prefix_string and tag.length > @removed_length) or
+                 tag == @remove_prefix
+               tag = tag[@removed_length..-1] || @default_tag
+             end
+             @timef.format(time) + @f_separator + tag + @f_separator + stringify_record(record) + "\n"
+           end
+         }
+       elsif @add_newline
+         self.instance_eval {
+           def format(tag,time,record)
+             @timef.format(time) + @f_separator + tag + @f_separator + stringify_record(record) + "\n"
+           end
+         }
+       elsif @remove_prefix
+         self.instance_eval {
+           def format(tag,time,record)
+             if (tag[0, @removed_length] == @removed_prefix_string and tag.length > @removed_length) or
+                 tag == @remove_prefix
+               tag = tag[@removed_length..-1] || @default_tag
+             end
+             @timef.format(time) + @f_separator + tag + @f_separator + stringify_record(record)
+           end
+         }
+       else
+         self.instance_eval {
+           def format(tag,time,record)
+             @timef.format(time) + @f_separator + tag + @f_separator + stringify_record(record)
+           end
+         }
+       end
+     elsif @output_include_time
+       if @add_newline
+         self.instance_eval {
+           def format(tag,time,record);
+             @timef.format(time) + @f_separator + stringify_record(record) + "\n"
+           end
+         }
+       else
+         self.instance_eval {
+           def format(tag,time,record);
+             @timef.format(time) + @f_separator + stringify_record(record)
+           end
+         }
+       end
+     elsif @output_include_tag
+       if @add_newline and @remove_prefix
+         self.instance_eval {
+           def format(tag,time,record)
+             if (tag[0, @removed_length] == @removed_prefix_string and tag.length > @removed_length) or
+                 tag == @remove_prefix
+               tag = tag[@removed_length..-1] || @default_tag
+             end
+             tag + @f_separator + stringify_record(record) + "\n"
+           end
+         }
+       elsif @add_newline
+         self.instance_eval {
+           def format(tag,time,record)
+             tag + @f_separator + stringify_record(record) + "\n"
+           end
+         }
+       elsif @remove_prefix
+         self.instance_eval {
+           def format(tag,time,record)
+             if (tag[0, @removed_length] == @removed_prefix_string and tag.length > @removed_length) or
+                 tag == @remove_prefix
+               tag = tag[@removed_length..-1] || @default_tag
+             end
+             tag + @f_separator + stringify_record(record)
+           end
+         }
+       else
+         self.instance_eval {
+           def format(tag,time,record)
+             tag + @f_separator + stringify_record(record)
+           end
+         }
+       end
+     else # without time, tag
+       if @add_newline
+         self.instance_eval {
+           def format(tag,time,record);
+             stringify_record(record) + "\n"
+           end
+         }
+       else
+         self.instance_eval {
+           def format(tag,time,record);
+             stringify_record(record)
+           end
+         }
+       end
+     end
+   end
+
+   def stringify_record(record)
+     record.to_json
+   end
+
+   def format(tag, time, record)
+     if tag == @remove_prefix or (tag[0, @removed_length] == @removed_prefix_string and tag.length > @removed_length)
+       tag = tag[@removed_length..-1] || @default_tag
+     end
+     time_str = if @output_include_time
+                  @timef.format(time) + @f_separator
+                else
+                  ''
+                end
+     tag_str = if @output_include_tag
+                 tag + @f_separator
+               else
+                 ''
+               end
+     time_str + tag_str + stringify_record(record) + "\n"
+   end
+
+ end
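To make the methods generated above concrete: with the defaults the WebHDFS output below uses (output_include_time and output_include_tag enabled, add_newline on, TAB as the field separator), format emits one line per event. For a hypothetical event tagged access.web01 carrying the record {"host" => "192.168.0.1", "code" => 200}, the output is roughly the following, where <TAB> marks the field separator and the exact time string depends on Fluentd's default TimeFormatter; the first line is output_data_type json, the second output_data_type attr:host,code:

    2012-05-20T10:00:00Z<TAB>access.web01<TAB>{"host":"192.168.0.1","code":200}
    2012-05-20T10:00:00Z<TAB>access.web01<TAB>192.168.0.1<TAB>200

In the attr: case, attributes missing from the record are written as the literal string NULL.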
data/lib/fluent/plugin/out_webhdfs.rb ADDED
@@ -0,0 +1,120 @@
+ # -*- coding: utf-8 -*-
+
+ require_relative 'ext_mixin'
+
+ class Fluent::WebHDFSOutput < Fluent::TimeSlicedOutput
+   Fluent::Plugin.register_output('webhdfs', self)
+
+   WEBHDFS_VERSION = 'v1'
+
+   config_set_default :buffer_type, 'memory'
+   config_set_default :time_slice_format, '%Y%m%d'
+
+   config_param :namenode, :string # host:port
+   config_param :path, :string
+   config_param :username, :string, :default => nil
+
+   include FluentExt::PlainTextFormatterMixin
+   config_set_default :output_include_time, true
+   config_set_default :output_include_tag, true
+   config_set_default :output_data_type, 'json'
+   config_set_default :field_separator, "\t"
+   config_set_default :add_newline, true
+   config_set_default :remove_prefix, nil
+
+   def initialize
+     super
+     require 'net/http'
+     require 'time'
+     require 'webhdfs'
+   end
+
+   def configure(conf)
+     if conf['path']
+       if conf['path'].index('%S')
+         conf['time_slice_format'] = '%Y%m%d%H%M%S'
+       elsif conf['path'].index('%M')
+         conf['time_slice_format'] = '%Y%m%d%H%M'
+       elsif conf['path'].index('%H')
+         conf['time_slice_format'] = '%Y%m%d%H'
+       end
+     end
+
+     super
+
+     unless /\A([a-zA-Z0-9][-a-zA-Z0-9.]*):(\d+)\Z/ =~ @namenode
+       raise Fluent::ConfigError, "Invalid config value about namenode: '#{@namenode}', needs NAMENODE_NAME:PORT"
+     end
+     @namenode_host = $1
+     @namenode_port = $2.to_i
+     unless @path.index('/') == 0
+       raise Fluent::ConfigError, "Path on hdfs MUST starts with '/', but '#{@path}'"
+     end
+     @conn = nil
+
+     @f_separator = case @field_separator
+                    when 'SPACE' then ' '
+                    when 'COMMA' then ','
+                    else "\t"
+                    end
+
+     # path => cached_url
+     # @cached_datanode_urls = {}
+     @client = WebHDFS::Client.new(@namenode_host, @namenode_port, @username)
+     @mutex = Mutex.new
+   end
+
+   def start
+     super
+
+     noerror = false
+     begin
+       ary = @client.list('/')
+       noerror = true
+     rescue
+       $log.error "webdhfs check request failed!"
+       raise
+     end
+     $log.info "webhdfs connection confirmed: #{@namenode_host}:#{@namenode_port}"
+   end
+
+   def shutdown
+     super
+   end
+
+   def record_to_string(record)
+     record.to_json
+   end
+
+   def format(tag, time, record)
+     time_str = @timef.format(time)
+     time_str + @f_separator + tag + @f_separator + record_to_string(record) + @line_end
+   end
+
+   def path_format(chunk_key)
+     Time.strptime(chunk_key, @time_slice_format).strftime(@path)
+   end
+
+   # TODO datanode url caching?
+
+   # TODO check conflictions
+
+   def send_data(path, data)
+     begin
+       @client.append(path, data)
+     rescue WebHDFS::FileNotFoundError
+       @client.create(path, data)
+     end
+   end
+
+   def write(chunk)
+     hdfs_path = path_format(chunk.key)
+     begin
+       send_data(hdfs_path, chunk.read)
+     rescue
+       $log.error "failed to communicate hdfs cluster, path: #{hdfs_path}"
+       raise
+     end
+     hdfs_path
+   end
+ end
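As a concrete illustration of path_format and write above: with the default time_slice_format %Y%m%d and a hypothetical path such as /log/access/%Y%m%d/access.log, a chunk keyed 20120520 is expanded to /log/access/20120520/access.log, and the whole chunk body is appended to that file on HDFS; the file is created on the first write when WebHDFS reports that it does not exist yet.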
metadata ADDED
@@ -0,0 +1,133 @@
+ --- !ruby/object:Gem::Specification
+ name: fluent-plugin-webhdfs
+ version: !ruby/object:Gem::Version
+ version: 0.0.1
+ prerelease:
+ platform: ruby
+ authors:
+ - TAGOMORI Satoshi
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-05-20 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+ name: rake
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: fluentd
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: webhdfs
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: fluentd
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ - !ruby/object:Gem::Dependency
+ name: webhdfs
+ requirement: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ description: Fluentd plugin to write data on HDFS over WebHDFS
+ email:
+ - tagomoris@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - fluent-plugin-webhdfs.gemspec
+ - lib/fluent/plugin/ext_mixin.rb
+ - lib/fluent/plugin/out_webhdfs.rb
+ homepage: https://github.com/tagomoris/fluent-plugin-webhdfs
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ! '>='
+ - !ruby/object:Gem::Version
+ version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.21
+ signing_key:
+ specification_version: 3
+ summary: Fluentd plugin to write data on HDFS over WebHDFS, with flexible formatting
+ test_files: []