logstash-filter-xml 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ NjBhYWFkYzM0NjM2ODNkNmQwYTg3YzRkNjc0ZmUzMDFhYjkzNTkzNg==
5
+ data.tar.gz: !binary |-
6
+ ZjM5NWZhMzczOTZkZTRmOTFiZmVjNDNhOWNiNDk2YjJjODc3Y2M3OQ==
7
+ SHA512:
8
+ metadata.gz: !binary |-
9
+ ODNkOTFiNDVhODEyNWI5NzcwYmJiNWU3ZmNjYTJmNDRmMWZkMGY0ZjAwODg1
10
+ ZDhhOGUzYTg5NmYxOTQ0ZjBjZTAzYzFiMDQ2ZjVmYmMwYTcyY2U1YmEzODE1
11
+ Mzc4OTkxY2JiMGI1YmJiMDgzMzIyMDg2ZWU0ZjVjYTdmZTFkNWU=
12
+ data.tar.gz: !binary |-
13
+ MmJlM2M0MTRhNDEwODU1Y2I2OWE5ZGIxYWMxZGZkMTRkMDllNjhlNWJlYTE3
14
+ NzQ4NmU0YTI5ZDkxMzY4NGJkZDgxODc4NTZmMGQ2ODkxMDdmMWU1YTE4N2Fj
15
+ MGMxYmJhN2FmNGMyY2ZhOGZmOGQzMTFlMjUwYmVjMzFkMTEwM2M=
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ Gemfile.lock
3
+ .bundle
4
+ vendor
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Use TLS: plain-http gem sources allow man-in-the-middle substitution of gems.
source 'https://rubygems.org'

gem 'rake'
gem 'gem_publisher'
gem 'archive-tar-minitar'
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
# List of vendor-file descriptors consumed by the rakelib "vendor" task.
@files = []

# Default task: just print the available tasks.
task :default do
  system("rake -T")
end
@@ -0,0 +1,139 @@
1
# encoding: utf-8
require "logstash/filters/base"
require "logstash/namespace"

# XML filter. Takes a field that contains XML and expands it into
# an actual datastructure.
class LogStash::Filters::Xml < LogStash::Filters::Base

  config_name "xml"
  milestone 1

  # Config for xml to hash is:
  #
  #     source => source_field
  #
  # For example, if you have the whole xml document in your @message field:
  #
  #     filter {
  #       xml {
  #         source => "message"
  #       }
  #     }
  #
  # The above would parse the xml from the @message field
  config :source, :validate => :string

  # Define target for placing the data.
  #
  # For example if you want the data to be put in the 'doc' field:
  #
  #     filter {
  #       xml {
  #         target => "doc"
  #       }
  #     }
  #
  # XML in the value of the source field will be expanded into a
  # datastructure in the "target" field.
  # Note: if the "target" field already exists, it will be overridden.
  # Required
  config :target, :validate => :string

  # xpath will additionally select string values (.to_s on whatever is selected)
  # from parsed XML (using each source field defined using the method above)
  # and place those values in the destination fields. Configuration:
  #
  #     xpath => [ "xpath-syntax", "destination-field" ]
  #
  # Values returned by XPath parsing from xpath-syntax will be put in the
  # destination field. Multiple values returned will be pushed onto the
  # destination field as an array. As such, multiple matches across
  # multiple source fields will produce duplicate entries in the field.
  #
  # More on xpath: http://www.w3schools.com/xpath/
  #
  # The xpath functions are particularly powerful:
  # http://www.w3schools.com/xpath/xpath_functions.asp
  config :xpath, :validate => :hash, :default => {}

  # By default the filter will store the whole parsed xml in the destination
  # field as described above. Setting this to false will prevent that.
  config :store_xml, :validate => :boolean, :default => true

  public
  def register
    # Loaded here so the dependencies are only paid for when the plugin is used.
    require "nokogiri"
    require "xmlsimple"
  end # def register

  public
  def filter(event)
    return unless filter?(event)
    matched = false

    @logger.debug("Running xml filter", :event => event)

    return unless event.include?(@source)

    value = event[@source]

    if value.is_a?(Array)
      if value.length != 1
        @logger.warn("XML filter only works on fields of length 1",
                     :source => @source, :value => value)
        return
      end
      # Unwrap a single-element array so the String handling below works.
      # Previously a length-1 array slipped past the `length > 1` guard and
      # raised NoMethodError on #strip.
      value = value.first
    end

    # Do nothing with an empty string.
    return if value.strip.length == 0

    if @xpath
      begin
        doc = Nokogiri::XML(value)
      rescue => e
        event.tag("_xmlparsefailure")
        @logger.warn("Trouble parsing xml", :source => @source, :value => value,
                     :exception => e, :backtrace => e.backtrace)
        return
      end

      @xpath.each do |xpath_src, xpath_dest|
        nodeset = doc.xpath(xpath_src)

        # If asking xpath for a String, like "name(/*)", we get back a
        # String instead of a NodeSet. We normalize that here.
        normalized_nodeset = nodeset.kind_of?(Nokogiri::XML::NodeSet) ? nodeset : [nodeset]

        # Renamed block variable (was `value`) so it no longer shadows the
        # source value used by the @store_xml branch below.
        normalized_nodeset.each do |node|
          # Some XPath functions return empty arrays as string.
          # NOTE(review): this aborts the whole filter rather than skipping
          # just the empty result — kept as-is to preserve behavior.
          if node.is_a?(Array)
            return if node.length == 0
          end

          unless node.nil?
            matched = true
            event[xpath_dest] ||= []
            event[xpath_dest] << node.to_s
          end
        end # normalized_nodeset.each
      end # @xpath.each
    end # if @xpath

    if @store_xml
      begin
        event[@target] = XmlSimple.xml_in(value)
        matched = true
      rescue => e
        event.tag("_xmlparsefailure")
        @logger.warn("Trouble parsing xml with XmlSimple", :source => @source,
                     :value => value, :exception => e, :backtrace => e.backtrace)
        return
      end
    end # if @store_xml

    filter_matched(event) if matched
    @logger.debug("Event after xml filter", :event => event)
  end # def filter
end # class LogStash::Filters::Xml
@@ -0,0 +1,27 @@
1
Gem::Specification.new do |s|
  s.name          = 'logstash-filter-xml'
  s.version       = '0.1.0'
  s.licenses      = ['Apache License (2.0)']
  s.summary       = "Takes a field that contains XML and expands it into an actual datastructure."
  s.description   = "Takes a field that contains XML and expands it into an actual datastructure."
  s.authors       = ["Elasticsearch"]
  s.email         = 'richard.pijnenburg@elasticsearch.com'
  s.homepage      = "http://logstash.net/"
  s.require_paths = ["lib"]

  # Ship everything tracked by git.
  s.files = `git ls-files`.split($\)

  # Test files live under test/, spec/ or features/.
  s.test_files = s.files.grep(%r{^(test|spec|features)/})

  # Special flag to let us know this is actually a logstash plugin
  s.metadata = { "logstash_plugin" => "true", "group" => "filter" }

  # Runtime dependencies.
  s.add_runtime_dependency 'logstash', '>= 1.4.0', '< 2.0.0'
  s.add_runtime_dependency 'nokogiri'
end
@@ -0,0 +1,9 @@
1
require "gem_publisher"

# Push this gem to RubyGems.org, but only when its version has changed.
desc "Publish gem to RubyGems.org"
task :publish_gem do |t|
  gemspec_path = Dir.glob(File.expand_path('../*.gemspec', File.dirname(__FILE__))).first
  gem = GemPublisher.publish_if_updated(gemspec_path, :rubygems)
  puts "Published #{gem}" if gem
end
@@ -0,0 +1,169 @@
1
+ require "net/http"
2
+ require "uri"
3
+ require "digest/sha1"
4
+
5
# Build a path rooted at the local vendor/ directory.
def vendor(*args)
  File.join("vendor", *args)
end
8
+
9
# Rake directory task: create vendor/ on demand before vendored files land in it.
directory "vendor/" => ["vendor"] do |task, args|
  mkdir task.name
end
12
+
13
# Download url into output and verify the payload's SHA1; aborts on mismatch.
def fetch(url, sha1, output)
  puts "Downloading #{url}"
  actual_sha1 = download(url, output)

  unless actual_sha1 == sha1
    fail "SHA1 does not match (expected '#{sha1}' but got '#{actual_sha1}')"
  end
end # def fetch
22
+
23
# Ensure the file named by url exists under vendor/ with the expected SHA1.
# Downloads (or re-downloads) it when the local copy is missing or its
# checksum does not match. Returns the local path.
def file_fetch(url, sha1)
  filename = File.basename( URI(url).path )
  # Was a garbled literal ("vendor/#(unknown)"): interpolate the basename.
  output = "vendor/#{filename}"
  task output => [ "vendor/" ] do
    begin
      actual_sha1 = file_sha1(output)
      if actual_sha1 != sha1
        fetch(url, sha1, output)
      end
    rescue Errno::ENOENT
      # No local copy yet: fetch it.
      fetch(url, sha1, output)
    end
  end.invoke

  return output
end
39
+
40
# Compute the SHA1 hex digest of the file at path, reading in 16 KiB chunks.
def file_sha1(path)
  digest = Digest::SHA1.new
  fd = File.new(path, "r")
  loop do
    begin
      digest.update(fd.sysread(16384))
    rescue EOFError
      break
    end
  end
  digest.hexdigest
ensure
  fd.close if fd
end
54
+
55
# Fetch url into output (written via a ".tmp" sidecar, renamed on success)
# and return the SHA1 hex digest of the downloaded bytes. Prints a progress
# percentage when attached to a TTY. Cleans up the temp file on any exit.
def download(url, output)
  uri = URI(url)
  digest = Digest::SHA1.new
  tmp = "#{output}.tmp"
  Net::HTTP.start(uri.host, uri.port, :use_ssl => (uri.scheme == "https")) do |http|
    request = Net::HTTP::Get.new(uri.path)
    http.request(request) do |response|
      # Net::HTTP reports the status as a String, so the original
      # `if [200, 301].include?(response.code)` never matched — and its
      # sense was inverted. Fail on anything other than 200/301.
      fail "HTTP fetch failed for #{url}. #{response}" unless [200, 301].include?(response.code.to_i)
      # 0.0 when the content-length header is absent (progress is skipped).
      size = response["content-length"].to_i.to_f
      count = 0
      # Binary mode: tarballs must not be newline-translated on Windows.
      File.open(tmp, "wb") do |fd|
        response.read_body do |chunk|
          fd.write(chunk)
          digest << chunk
          if size > 0 && $stdout.tty?
            count += chunk.bytesize
            $stdout.write(sprintf("\r%0.2f%%", count/size * 100))
          end
        end
      end
      $stdout.write("\r      \r") if $stdout.tty?
    end
  end

  File.rename(tmp, output)

  return digest.hexdigest
rescue SocketError => e
  puts "Failure while downloading #{url}: #{e}"
  raise
ensure
  File.unlink(tmp) if File.exist?(tmp)
end # def download
88
+
89
# Extract a gzipped tarball. The block maps each tar entry to its output
# path (or nil to skip the entry). Files already present with identical
# size and mode are left untouched. Deletes the tarball when done.
def untar(tarball, &block)
  require "archive/tar/minitar"
  tgz = Zlib::GzipReader.new(File.open(tarball))
  # Pull out typesdb
  tar = Archive::Tar::Minitar::Input.open(tgz)
  tar.each do |entry|
    path = block.call(entry)
    next if path.nil?
    parent = File.dirname(path)

    mkdir_p parent unless File.directory?(parent)

    # Skip this file if the output file is the same size
    if entry.directory?
      mkdir path unless File.directory?(path)
    else
      # minitar does not expose the header mode/size, so reach into ivars.
      entry_mode = entry.instance_eval { @mode } & 0777
      # File.exists? is deprecated (removed in Ruby 3.2); use File.exist?.
      if File.exist?(path)
        stat = File.stat(path)
        # TODO(sissel): Submit a patch to archive-tar-minitar upstream to
        # expose headers in the entry.
        entry_size = entry.instance_eval { @size }
        # If file sizes are same, skip writing.
        next if stat.size == entry_size && (stat.mode & 0777) == entry_mode
      end
      puts "Extracting #{entry.full_name} from #{tarball} #{entry_mode.to_s(8)}"
      File.open(path, "w") do |fd|
        # eof? check lets us skip empty files. Necessary because the API provided by
        # Archive::Tar::Minitar::Reader::EntryStream only mostly acts like an
        # IO object. Something about empty files in this EntryStream causes
        # IO.copy_stream to throw "can't convert nil into String" on JRuby
        # TODO(sissel): File a bug about this.
        while !entry.eof?
          chunk = entry.read(16384)
          fd.write(chunk)
        end
      end
      File.chmod(entry_mode, path)
    end
  end
  tar.close
  File.unlink(tarball) if File.file?(tarball)
end # def untar
133
+
134
# Decompress a .gz file in place: writes the payload next to the input
# (same name minus the trailing ".gz") and deletes the input on success.
# On failure the partial output is removed and the error re-raised.
def ungz(file)
  # Anchored sub: gsub('.gz', '') would also strip ".gz" mid-name
  # (e.g. "foo.gz.tar" -> "foo.tar" would become "footar").
  outpath = file.sub(/\.gz\z/, '')
  # Binary modes so payload bytes survive Windows newline translation.
  tgz = Zlib::GzipReader.new(File.open(file, "rb"))
  begin
    File.open(outpath, "wb") do |out|
      IO::copy_stream(tgz, out)
    end
    File.unlink(file)
  rescue
    File.unlink(outpath) if File.file?(outpath)
    raise
  ensure
    # Previously leaked when the rescue path re-raised.
    tgz.close
  end
end
149
+
150
# Fetch every descriptor in @files into vendor/, unpacking tarballs and
# gunzipping single files as needed.
desc "Process any vendor files required for this plugin"
task "vendor" do |task, args|

  @files.each do |file|
    download = file_fetch(file['url'], file['sha1'])
    if download =~ /.tar.gz/
      prefix = download.gsub('.tar.gz', '').gsub('vendor/', '')
      untar(download) do |entry|
        if !file['files'].nil?
          next unless file['files'].include?(entry.full_name.gsub(prefix, ''))
        end
        # Computed unconditionally: previously `out` was only assigned
        # inside the branch above, so a nil 'files' list made
        # File.join('vendor', nil) raise TypeError.
        out = entry.full_name.split("/").last
        File.join('vendor', out)
      end
    elsif download =~ /.gz/
      ungz(download)
    end
  end

end
@@ -0,0 +1,175 @@
1
# encoding: utf-8
require "spec_helper"
require "logstash/filters/xml"

describe LogStash::Filters::Xml do

  describe "parse standard xml (Deprecated checks)" do
    config <<-CONFIG
      filter {
        xml {
          source => "raw"
          target => "data"
        }
      }
    CONFIG

    # An attribute becomes a plain key/value pair.
    sample("raw" => '<foo key="value"/>') do
      insist { subject["tags"] }.nil?
      insist { subject["data"] } == {"key" => "value"}
    end

    # Repeated elements collapse into an array value.
    sample("raw" => '<foo><key>value1</key><key>value2</key></foo>') do
      insist { subject["tags"] }.nil?
      insist { subject["data"] } == {"key" => ["value1", "value2"]}
    end

    # Nested elements become nested hashes.
    sample("raw" => '<foo><key1><key2>value</key2></key1></foo>') do
      insist { subject["tags"] }.nil?
      insist { subject["data"] } == {"key1" => [{"key2" => ["value"]}]}
    end

    # Malformed XML tags the event with _xmlparsefailure.
    sample("raw" => '<foo /') do
      insist { subject["tags"] }.include?("_xmlparsefailure")
    end
  end

  describe "parse standard xml but do not store (Deprecated checks)" do
    config <<-CONFIG
      filter {
        xml {
          source => "raw"
          target => "data"
          store_xml => false
        }
      }
    CONFIG

    # With store_xml disabled the target field stays empty.
    sample("raw" => '<foo key="value"/>') do
      insist { subject["tags"] }.nil?
      insist { subject["data"] } == nil
    end
  end

  describe "parse xml and store values with xpath (Deprecated checks)" do
    config <<-CONFIG
      filter {
        xml {
          source => "raw"
          target => "data"
          xpath => [ "/foo/key/text()", "xpath_field" ]
        }
      }
    CONFIG

    # A single xpath match yields a one-element array.
    sample("raw" => '<foo><key>value</key></foo>') do
      insist { subject["tags"] }.nil?
      insist { subject["xpath_field"] } == ["value"]
    end

    # Multiple matches are appended in document order.
    sample("raw" => '<foo><key>value1</key><key>value2</key></foo>') do
      insist { subject["tags"] }.nil?
      insist { subject["xpath_field"] } == ["value1","value2"]
    end
  end

  ## New tests

  describe "parse standard xml" do
    config <<-CONFIG
      filter {
        xml {
          source => "xmldata"
          target => "data"
        }
      }
    CONFIG

    # An attribute becomes a plain key/value pair.
    sample("xmldata" => '<foo key="value"/>') do
      insist { subject["tags"] }.nil?
      insist { subject["data"] } == {"key" => "value"}
    end

    # Repeated elements collapse into an array value.
    sample("xmldata" => '<foo><key>value1</key><key>value2</key></foo>') do
      insist { subject["tags"] }.nil?
      insist { subject["data"] } == {"key" => ["value1", "value2"]}
    end

    # Nested elements become nested hashes.
    sample("xmldata" => '<foo><key1><key2>value</key2></key1></foo>') do
      insist { subject["tags"] }.nil?
      insist { subject["data"] } == {"key1" => [{"key2" => ["value"]}]}
    end

    # Malformed XML tags the event with _xmlparsefailure.
    sample("xmldata" => '<foo /') do
      insist { subject["tags"] }.include?("_xmlparsefailure")
    end
  end

  describe "parse standard xml but do not store" do
    config <<-CONFIG
      filter {
        xml {
          source => "xmldata"
          target => "data"
          store_xml => false
        }
      }
    CONFIG

    # With store_xml disabled the target field stays empty.
    sample("xmldata" => '<foo key="value"/>') do
      insist { subject["tags"] }.nil?
      insist { subject["data"] } == nil
    end
  end

  describe "parse xml and store values with xpath" do
    config <<-CONFIG
      filter {
        xml {
          source => "xmldata"
          target => "data"
          xpath => [ "/foo/key/text()", "xpath_field" ]
        }
      }
    CONFIG

    # A single xpath match yields a one-element array.
    sample("xmldata" => '<foo><key>value</key></foo>') do
      insist { subject["tags"] }.nil?
      insist { subject["xpath_field"] } == ["value"]
    end

    # Multiple matches are appended in document order.
    sample("xmldata" => '<foo><key>value1</key><key>value2</key></foo>') do
      insist { subject["tags"] }.nil?
      insist { subject["xpath_field"] } == ["value1","value2"]
    end
  end

  describe "parse correctly non ascii content with xpath" do
    config <<-CONFIG
      filter {
        xml {
          source => "xmldata"
          target => "data"
          xpath => [ "/foo/key/text()", "xpath_field" ]
        }
      }
    CONFIG

    # Non-ASCII text survives the xpath extraction intact.
    sample("xmldata" => '<foo><key>Français</key></foo>') do
      insist { subject["tags"] }.nil?
      insist { subject["xpath_field"] } == ["Français"]
    end
  end

end
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: logstash-filter-xml
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Elasticsearch
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-10-25 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: logstash
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ! '>='
18
+ - !ruby/object:Gem::Version
19
+ version: 1.4.0
20
+ - - <
21
+ - !ruby/object:Gem::Version
22
+ version: 2.0.0
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 1.4.0
30
+ - - <
31
+ - !ruby/object:Gem::Version
32
+ version: 2.0.0
33
+ - !ruby/object:Gem::Dependency
34
+ name: nokogiri
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ! '>='
38
+ - !ruby/object:Gem::Version
39
+ version: '0'
40
+ type: :runtime
41
+ prerelease: false
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ! '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ description: Takes a field that contains XML and expands it into an actual datastructure.
48
+ email: richard.pijnenburg@elasticsearch.com
49
+ executables: []
50
+ extensions: []
51
+ extra_rdoc_files: []
52
+ files:
53
+ - .gitignore
54
+ - Gemfile
55
+ - Rakefile
56
+ - lib/logstash/filters/xml.rb
57
+ - logstash-filter-xml.gemspec
58
+ - rakelib/publish.rake
59
+ - rakelib/vendor.rake
60
+ - spec/filters/xml_spec.rb
61
+ homepage: http://logstash.net/
62
+ licenses:
63
+ - Apache License (2.0)
64
+ metadata:
65
+ logstash_plugin: 'true'
66
+ group: filter
67
+ post_install_message:
68
+ rdoc_options: []
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ! '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ! '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubyforge_project:
83
+ rubygems_version: 2.4.1
84
+ signing_key:
85
+ specification_version: 4
86
+ summary: Takes a field that contains XML and expands it into an actual datastructure.
87
+ test_files:
88
+ - spec/filters/xml_spec.rb