logstash-input-csvfile 0.0.6
- checksums.yaml +7 -0
- data/lib/logstash/inputs/csvfile.rb +234 -0
- data/spec/inputs/csvfile_spec.rb +521 -0
- metadata +124 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: a3f4dde1199a122dfa627e710320adb46356b423
  data.tar.gz: 91dfb639e4dbcc82d53297d6ef4b18b359042002
SHA512:
  metadata.gz: 671ea579d1cc6ec358c7695f22a59f06f1a67f269389e384c9fc527aa662c4d8c0c5f373c6837f264da040cf177824c1e86e4e5f13fd5126cef0dba75370dd74
  data.tar.gz: 86321481379fc771b41b7919576624446d41a6add7264e7862357fb71404cdea6c8d3b0327aeca916c6f3932a3866832ee5a136b992296de3b8862ca4c22fb11
data/lib/logstash/inputs/csvfile.rb
ADDED
@@ -0,0 +1,234 @@
# encoding: utf-8
require "logstash/inputs/file"
require "logstash/namespace"
require "csv"

# Subclass of logstash-input-file that parses CSV lines, with support for first-line schemas.
# Set first_line_defines_columns => true to enable this behavior.
# Statically defined columns are also supported, a la logstash-filter-csv, via the columns param.
# first_line_defines_columns => true takes precedence, though.
#
# Since multiple files may be read by the same plugin instance, and each can have
# a distinct schema, this plugin records a schema per source file (as defined by the
# event's path attribute) in a hash. When it receives an event for a file it doesn't
# know, it reads/parses that file's first line to obtain the schema. This method supports
# resuming processing after logstash restarts in mid-file.
#
# I considered extending logstash-filter-csv to do this, but felt that the only reliable
# way to support streaming csv reads was to explicitly read the schema from the file's schema row
# (and cache it so subsequent row performance for that file is good). Since we cannot count
# on a logstash filter having read-access to the file, or even processing events that originate
# from files, I rejected that approach. By definition, a file input plugin must have read-access
# to the file it's sourcing data from.
#
# This plugin borrows most of its csv parsing logic from logstash-filter-csv.
#
# This plugin extends logstash-input-file by overriding its decorate method. Note that
# logstash-input-file 0.0.10, released with Logstash 1.5, doesn't set the event's
# path element before calling decorate (which this plugin requires), so the gemspec insists
# on logstash-input-file 1.1.0.
#

class LogStash::Inputs::CSVFile < LogStash::Inputs::File
  config_name "csvfile"

  # Define a list of column names (in the order they appear in the CSV,
  # as if it were a header line). If `columns` is not configured, or there
  # are not enough columns specified, the default column names are
  # "column1", "column2", etc. In the case that there are more columns
  # in the data than specified in this column list, extra columns will be auto-numbered:
  # (e.g. "user_defined_1", "user_defined_2", "column3", "column4", etc.)
  config :columns, :validate => :array, :default => []

  # Bool flag enables sourcing of column names from the first event (line) of each file.
  # A dynamic alternative to explicitly defining columns in the columns attribute.
  config :first_line_defines_columns, :validate => :boolean, :default => false

  # Define the column separator value. If this is not specified, the default
  # is a comma `,`.
  # Optional.
  config :separator, :validate => :string, :default => ","

  # Define the character used to quote CSV fields. If this is not specified
  # the default is a double quote `"`.
  # Optional.
  config :quote_char, :validate => :string, :default => '"'

  # Define target field for placing the data.
  # Defaults to writing to the root of the event.
  config :target, :validate => :string

  # The maximum time a csv file's schema can be unused (in hours) before
  # it is automatically scrubbed to avoid memory leakage.
  # If an event for that file arrives subsequently the schema will be
  # reconstituted (albeit with the penalty of a schema row re-read from the file).
  #
  # Cache scrubbing occurs inline only when new files are detected, to minimize
  # perf impact on most CSV events. Since new file detection time is the only time
  # the cache actually grows, and we're expecting to pay the schema-read penalty then
  # anyway, it's an optimal time to scrub.
  #
  # 0 disables, but memory will grow. OK if you're routinely restarting logstash.
  config :max_cached_schema_age_hours, :validate => :number, :default => 24

  # To handle cases where there's other content in the file before the schema row
  # that you'll want to ignore. For instance, you can skip leading blank lines
  # before the schema by matching to non-blank lines using "^.+$".
  # Note that the plugin will still emit events for pre-schema rows, albeit with
  # no attributes (for blank lines) or default-named attributes (if the pre-schema
  # lines do parse as valid CSV).
  config :schema_pattern_to_match, :validate => :string

  # To support testing. Adds attributes to events regarding schema cache behavior.
  config :add_schema_cache_telemetry_to_event, :validate => :boolean, :default => false

  public
  def register
    @fileColumns = Hash.new
    @schemaTouchedTimes = Hash.new
    super()

    @logger.warn("schema cache scrubbing disabled. Memory use will grow over time.") if @max_cached_schema_age_hours <= 0
  end

  def decorate(event)
    super(event)

    message = event["message"]
    return if !message

    begin
      values = CSV.parse_line(message, :col_sep => @separator, :quote_char => @quote_char)
      return if values.length == 0

      # Get names for the columns.
      if @first_line_defines_columns
        @logger.debug? && @logger.debug("handling csv in first_line_defines_columns mode", :message => message, :columns => @columns)
        cols = getSchemaForFile(event, values)
      else
        @logger.debug? && @logger.debug("handling csv in explicitly defined columns mode", :message => message, :columns => @columns)
        cols = @columns
      end

      # Determine where to write the new attributes
      if @target.nil?
        # Default is to write to the root of the event.
        dest = event
      else
        dest = event[@target] ||= {}
      end

      # Add the per-column attributes (as long as this isn't the event from the schema-defining row)
      if !event["_csvmetadata"]
        values.each_index do |i|
          field_name = cols[i] || "column#{i+1}"
          dest[field_name] = values[i]
        end
      end

    rescue => e
      event.tag "_csvparsefailure"
      @logger.warn("Trouble parsing csv", :message => message, :exception => e)
      return
    end # begin
  end # decorate()

  def getSchemaForFile(event, parsedValues)
    path = event["path"]
    if !path
      @logger.warn("No path in event. Cannot retrieve a schema for this event.")
      return []
    end

    @logger.debug? && @logger.debug("Getting schema for file", :path => path)

    schema = getCachedSchemaForFile(path)
    if schema
      @logger.debug? && @logger.debug("Using cached schema", :cols => schema)
      event["_schemacachetelemetry"] = "cachedEntryUsed" if @add_schema_cache_telemetry_to_event
      touchSchema(path)
      return schema
    end

    @logger.debug? && @logger.debug("Event from unknown file/schema. Reading schema from that file.", :path => path)

    scrubSchemaCache(event) if @max_cached_schema_age_hours > 0

    csvFileLine = readSchemaLineFromFile(path)
    if !csvFileLine || csvFileLine.length == 0
      @logger.warn("No suitable schema row found in file.", :path => path)
      return []
    end

    schema = CSV.parse_line(csvFileLine, :col_sep => @separator, :quote_char => @quote_char)
    addSchemaToCache(path, schema)
    @logger.debug? && @logger.debug("Schema read from file:", :path => path, :cols => schema)

    if @add_schema_cache_telemetry_to_event
      event["_schemacachetelemetry"] = "newEntryCreated"
      event["_cache_touch_time"] = Time.now
    end

    # Special handling for the schema row event: tag _csvmetadata and don't return individual column attributes
    if @fileColumns[path].join == parsedValues.join
      @logger.debug? && @logger.debug("Received the schema row event. Tagging w/ _csvmetadata", :message => event["message"])
      event["_csvmetadata"] = true
      return []
    else
      return schema
    end

  end

  def getCachedSchemaForFile(path)
    @fileColumns[path]
  end

  def addSchemaToCache(path, schema)
    @fileColumns[path] = schema
    touchSchema(path)
  end

  def touchSchema(path)
    @schemaTouchedTimes[path] = Time.now
  end

  def readSchemaLineFromFile(path)
    csvFileLine = ""
    File.open(path, "r") do |f|
      while csvFileLine.length == 0 and csvFileLine = f.gets
        if @schema_pattern_to_match
          if !csvFileLine.end_with?("\n") or !csvFileLine.match(@schema_pattern_to_match)
            csvFileLine = ""
          end
        end
      end
    end
    csvFileLine
  end

  def scrubSchemaCache(event)
    @logger.debug? && @logger.debug("Scrubbing schema cache", :size => @fileColumns.length)
    event["_schemacachetelemetryscrubbedbeforecount"] = @fileColumns.length if @add_schema_cache_telemetry_to_event

    expiringFiles = []
    now = Time.now
    @schemaTouchedTimes.each do |filename, lastReadTime|
      if (lastReadTime + (@max_cached_schema_age_hours * 60 * 60)) < now
        expiringFiles << filename
        @logger.debug? && @logger.debug("Expiring schema for: ", :file => filename, :lastRead => lastReadTime)
      end
    end

    expiringFiles.each do |filename|
      @fileColumns.delete(filename)
      @schemaTouchedTimes.delete(filename)
      @logger.debug? && @logger.debug("Deleted schema for: ", :file => filename)
    end

    event["_schemacachetelemetryscrubbedaftercount"] = @fileColumns.length if @add_schema_cache_telemetry_to_event
    @logger.debug? && @logger.debug("Done scrubbing schema cache", :size => @fileColumns.length)

  end

end # class LogStash::Inputs::CSVFile
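To make the options above concrete, here is a minimal sketch of a pipeline configuration for this input, assuming a hypothetical file /var/log/example.csv whose first line is a header row. The option names come from the config declarations above; path and start_position are inherited from logstash-input-file.

input {
  csvfile {
    path => "/var/log/example.csv"        # hypothetical sample path
    start_position => "beginning"
    first_line_defines_columns => true    # take column names from each file's first line
    separator => ","
    max_cached_schema_age_hours => 24     # drop an idle file's cached schema after a day
  }
}

With first_line_defines_columns left false, a static columns => ["FIRST_COL", "SECOND_COL"] list can be supplied instead, as in logstash-filter-csv; the specs below exercise both modes.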
data/spec/inputs/csvfile_spec.rb
ADDED
@@ -0,0 +1,521 @@
# encoding: utf-8

require "logstash/devutils/rspec/spec_helper"
require "tempfile"
require "stud/temporary"
require "logstash/inputs/csvfile"

describe "inputs/csvfile" do

  delimiter = (LogStash::Environment.windows? ? "\r\n" : "\n")

  #Borrowed this first check from file_spec.rb verbatim to get the pipeline running...
  it "should start at the end of an existing file" do
    tmpfile_path = Stud::Temporary.pathname
    sincedb_path = Stud::Temporary.pathname

    conf = <<-CONFIG
      input {
        file {
          type => "blah"
          path => "#{tmpfile_path}"
          sincedb_path => "#{sincedb_path}"
          delimiter => "#{delimiter}"
        }
      }
    CONFIG

    File.open(tmpfile_path, "w") do |fd|
      fd.puts("ignore me 1")
      fd.puts("ignore me 2")
    end

    events = input(conf) do |pipeline, queue|

      # At this point the plugin's
      # threads might still be initializing, so we cannot know when the
      # file plugin will have seen the original file; it could see it
      # after the first(s) hello world appends below, hence the
      # retry logic.

      events = []

      retries = 0
      while retries < 20
        File.open(tmpfile_path, "a") do |fd|
          fd.puts("hello")
          fd.puts("world")
        end

        if queue.size >= 2
          events = 2.times.collect { queue.pop }
          break
        end

        sleep(0.1)
        retries += 1
      end

      events
    end #input block

    insist { events[0]["message"] } == "hello"
    insist { events[1]["message"] } == "world"
  end #it

it "should parse csv columns into event attributes using default column names" do
|
67
|
+
tmpfile_path = Stud::Temporary.pathname
|
68
|
+
sincedb_path = Stud::Temporary.pathname
|
69
|
+
|
70
|
+
conf = <<-CONFIG
|
71
|
+
input {
|
72
|
+
csvfile {
|
73
|
+
path => "#{tmpfile_path}"
|
74
|
+
start_position => "beginning"
|
75
|
+
sincedb_path => "#{sincedb_path}"
|
76
|
+
delimiter => "#{delimiter}"
|
77
|
+
}
|
78
|
+
}
|
79
|
+
CONFIG
|
80
|
+
|
81
|
+
File.open(tmpfile_path, "a") do |fd|
|
82
|
+
fd.puts("first,second,third")
|
83
|
+
fd.puts('"fou,rth","fifth"') #Quoting check
|
84
|
+
fd.puts("sixth,seventh,eighth,ninth")
|
85
|
+
end
|
86
|
+
|
87
|
+
events = input(conf) do |pipeline, queue|
|
88
|
+
3.times.collect { queue.pop }
|
89
|
+
end
|
90
|
+
|
91
|
+
insist { events[0]["column1"] } == "first"
|
92
|
+
insist { events[0]["column2"] } == "second"
|
93
|
+
insist { events[0]["column3"] } == "third"
|
94
|
+
insist { events[1]["column1"] } == "fou,rth" #Not a typo: quoting check
|
95
|
+
insist { events[1]["column2"] } == "fifth"
|
96
|
+
insist { events[2]["column1"] } == "sixth"
|
97
|
+
insist { events[2]["column2"] } == "seventh"
|
98
|
+
insist { events[2]["column3"] } == "eighth"
|
99
|
+
insist { events[2]["column4"] } == "ninth"
|
100
|
+
|
101
|
+
end #it
|
102
|
+
|
103
|
+
it "should parse csv columns into attributes using explicitly defined column names, default-naming any excess columns; non-default csv separator" do
|
104
|
+
tmpfile_path = Stud::Temporary.pathname
|
105
|
+
sincedb_path = Stud::Temporary.pathname
|
106
|
+
|
107
|
+
conf = <<-CONFIG
|
108
|
+
input {
|
109
|
+
csvfile {
|
110
|
+
path => "#{tmpfile_path}"
|
111
|
+
start_position => "beginning"
|
112
|
+
sincedb_path => "#{sincedb_path}"
|
113
|
+
delimiter => "#{delimiter}"
|
114
|
+
separator => ";"
|
115
|
+
columns => ["FIRST_COL","SECOND_COL","THIRD_COL"]
|
116
|
+
}
|
117
|
+
}
|
118
|
+
CONFIG
|
119
|
+
|
120
|
+
File.open(tmpfile_path, "a") do |fd|
|
121
|
+
fd.puts("first;second;third")
|
122
|
+
fd.puts("fourth;fifth")
|
123
|
+
fd.puts("sixth;sev,enth;eighth;ninth")
|
124
|
+
end
|
125
|
+
|
126
|
+
events = input(conf) do |pipeline, queue|
|
127
|
+
3.times.collect { queue.pop }
|
128
|
+
end
|
129
|
+
|
130
|
+
insist { events[0]["FIRST_COL"] } == "first"
|
131
|
+
insist { events[0]["SECOND_COL"] } == "second"
|
132
|
+
insist { events[0]["THIRD_COL"] } == "third"
|
133
|
+
insist { events[1]["FIRST_COL"] } == "fourth"
|
134
|
+
insist { events[1]["SECOND_COL"] } == "fifth"
|
135
|
+
insist { events[2]["FIRST_COL"] } == "sixth"
|
136
|
+
insist { events[2]["SECOND_COL"] } == "sev,enth"
|
137
|
+
insist { events[2]["THIRD_COL"] } == "eighth"
|
138
|
+
insist { events[2]["column4"] } == "ninth"
|
139
|
+
|
140
|
+
end #it
|
141
|
+
|
142
|
+
it "should parse csv columns into attributes using column names defined on the csv files 0th row with each csv file defining its own independent schema; it should tag schema row events as _csvmetadata" do
|
143
|
+
tmpfile_path = Stud::Temporary.pathname
|
144
|
+
tmpfile2_path = Stud::Temporary.pathname
|
145
|
+
sincedb_path = Stud::Temporary.pathname
|
146
|
+
|
147
|
+
conf = <<-CONFIG
|
148
|
+
input {
|
149
|
+
csvfile {
|
150
|
+
path => "#{tmpfile_path}"
|
151
|
+
path => "#{tmpfile2_path}"
|
152
|
+
start_position => "beginning"
|
153
|
+
sincedb_path => "#{sincedb_path}"
|
154
|
+
delimiter => "#{delimiter}"
|
155
|
+
first_line_defines_columns => true
|
156
|
+
}
|
157
|
+
}
|
158
|
+
CONFIG
|
159
|
+
|
160
|
+
File.open(tmpfile_path, "a") do |fd|
|
161
|
+
fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
|
162
|
+
fd.puts("first,second,third")
|
163
|
+
fd.puts("fourth,fifth")
|
164
|
+
fd.puts("sixth,seventh,eighth,ninth")
|
165
|
+
end
|
166
|
+
|
167
|
+
events = input(conf) do |pipeline, queue|
|
168
|
+
4.times.collect { queue.pop }
|
169
|
+
end
|
170
|
+
|
171
|
+
insist { events[0]["_csvmetadata"] } == true
|
172
|
+
insist { events[1]["A_COLUMN"] } == "first"
|
173
|
+
insist { events[1]["B_COLUMN"] } == "second"
|
174
|
+
insist { events[1]["C_COLUMN"] } == "third"
|
175
|
+
insist { events[2]["A_COLUMN"] } == "fourth"
|
176
|
+
insist { events[2]["B_COLUMN"] } == "fifth"
|
177
|
+
insist { events[3]["A_COLUMN"] } == "sixth"
|
178
|
+
insist { events[3]["B_COLUMN"] } == "seventh"
|
179
|
+
insist { events[3]["C_COLUMN"] } == "eighth"
|
180
|
+
insist { events[3]["column4"] } == "ninth"
|
181
|
+
|
182
|
+
File.open(tmpfile2_path, "a") do |fd|
|
183
|
+
fd.puts("D_COLUMN,E_COLUMN,F_COLUMN")
|
184
|
+
fd.puts("first,second,third")
|
185
|
+
fd.puts("fourth,fifth")
|
186
|
+
fd.puts("sixth,seventh,eighth,ninth")
|
187
|
+
end
|
188
|
+
|
189
|
+
events = input(conf) do |pipeline, queue|
|
190
|
+
4.times.collect { queue.pop }
|
191
|
+
end
|
192
|
+
|
193
|
+
insist { events[0]["_csvmetadata"] } == true
|
194
|
+
insist { events[1]["D_COLUMN"] } == "first"
|
195
|
+
insist { events[1]["E_COLUMN"] } == "second"
|
196
|
+
insist { events[1]["F_COLUMN"] } == "third"
|
197
|
+
insist { events[2]["D_COLUMN"] } == "fourth"
|
198
|
+
insist { events[2]["E_COLUMN"] } == "fifth"
|
199
|
+
insist { events[3]["D_COLUMN"] } == "sixth"
|
200
|
+
insist { events[3]["E_COLUMN"] } == "seventh"
|
201
|
+
insist { events[3]["F_COLUMN"] } == "eighth"
|
202
|
+
insist { events[3]["column4"] } == "ninth"
|
203
|
+
|
204
|
+
end #it
|
205
|
+
|
206
|
+
it "should parse csv columns into attributes using column names defined on the first file row that matches the schema_pattern_to_match with each csv file defining its own independent schema; it should tag schema row events as _csvmetadata" do
|
207
|
+
tmpfile_path = Stud::Temporary.pathname
|
208
|
+
sincedb_path = Stud::Temporary.pathname
|
209
|
+
|
210
|
+
conf = <<-CONFIG
|
211
|
+
input {
|
212
|
+
csvfile {
|
213
|
+
path => "#{tmpfile_path}"
|
214
|
+
start_position => "beginning"
|
215
|
+
sincedb_path => "#{sincedb_path}"
|
216
|
+
delimiter => "#{delimiter}"
|
217
|
+
first_line_defines_columns => true
|
218
|
+
schema_pattern_to_match => "^.+$"
|
219
|
+
}
|
220
|
+
}
|
221
|
+
CONFIG
|
222
|
+
|
223
|
+
File.open(tmpfile_path, "a") do |fd|
|
224
|
+
fd.puts("")
|
225
|
+
fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
|
226
|
+
fd.puts("first,second,third")
|
227
|
+
fd.puts("fourth,fifth")
|
228
|
+
fd.puts("sixth,seventh,eighth,ninth")
|
229
|
+
end
|
230
|
+
|
231
|
+
events = input(conf) do |pipeline, queue|
|
232
|
+
5.times.collect { queue.pop }
|
233
|
+
end
|
234
|
+
|
235
|
+
insist { events[1]["_csvmetadata"] } == true
|
236
|
+
insist { events[2]["A_COLUMN"] } == "first"
|
237
|
+
insist { events[2]["B_COLUMN"] } == "second"
|
238
|
+
insist { events[2]["C_COLUMN"] } == "third"
|
239
|
+
insist { events[3]["A_COLUMN"] } == "fourth"
|
240
|
+
insist { events[3]["B_COLUMN"] } == "fifth"
|
241
|
+
insist { events[4]["A_COLUMN"] } == "sixth"
|
242
|
+
insist { events[4]["B_COLUMN"] } == "seventh"
|
243
|
+
insist { events[4]["C_COLUMN"] } == "eighth"
|
244
|
+
insist { events[4]["column4"] } == "ninth"
|
245
|
+
|
246
|
+
end #it
|
247
|
+
|
248
|
+
it "should parse csv columns into attributes using explicitly defined column names, default-naming any excess columns; non-default csv separator" do
|
249
|
+
tmpfile_path = Stud::Temporary.pathname
|
250
|
+
sincedb_path = Stud::Temporary.pathname
|
251
|
+
|
252
|
+
conf = <<-CONFIG
|
253
|
+
input {
|
254
|
+
csvfile {
|
255
|
+
path => "#{tmpfile_path}"
|
256
|
+
start_position => "beginning"
|
257
|
+
sincedb_path => "#{sincedb_path}"
|
258
|
+
delimiter => "#{delimiter}"
|
259
|
+
separator => ";"
|
260
|
+
columns => ["FIRST_COL","SECOND_COL","THIRD_COL"]
|
261
|
+
}
|
262
|
+
}
|
263
|
+
CONFIG
|
264
|
+
|
265
|
+
File.open(tmpfile_path, "a") do |fd|
|
266
|
+
fd.puts("first;second;third")
|
267
|
+
fd.puts("fourth;fifth")
|
268
|
+
fd.puts("sixth;sev,enth;eighth;ninth")
|
269
|
+
end
|
270
|
+
|
271
|
+
events = input(conf) do |pipeline, queue|
|
272
|
+
3.times.collect { queue.pop }
|
273
|
+
end
|
274
|
+
|
275
|
+
insist { events[0]["FIRST_COL"] } == "first"
|
276
|
+
insist { events[0]["SECOND_COL"] } == "second"
|
277
|
+
insist { events[0]["THIRD_COL"] } == "third"
|
278
|
+
insist { events[1]["FIRST_COL"] } == "fourth"
|
279
|
+
insist { events[1]["SECOND_COL"] } == "fifth"
|
280
|
+
insist { events[2]["FIRST_COL"] } == "sixth"
|
281
|
+
insist { events[2]["SECOND_COL"] } == "sev,enth"
|
282
|
+
insist { events[2]["THIRD_COL"] } == "eighth"
|
283
|
+
insist { events[2]["column4"] } == "ninth"
|
284
|
+
|
285
|
+
end #it
|
286
|
+
|
287
|
+
it "should cache schemas per file" do
|
288
|
+
tmpfile_path = Stud::Temporary.pathname
|
289
|
+
tmpfile2_path = Stud::Temporary.pathname
|
290
|
+
sincedb_path = Stud::Temporary.pathname
|
291
|
+
|
292
|
+
conf = <<-CONFIG
|
293
|
+
input {
|
294
|
+
csvfile {
|
295
|
+
path => "#{tmpfile_path}"
|
296
|
+
path => "#{tmpfile2_path}"
|
297
|
+
start_position => "beginning"
|
298
|
+
sincedb_path => "#{sincedb_path}"
|
299
|
+
delimiter => "#{delimiter}"
|
300
|
+
first_line_defines_columns => true
|
301
|
+
add_schema_cache_telemetry_to_event => true
|
302
|
+
}
|
303
|
+
}
|
304
|
+
CONFIG
|
305
|
+
|
306
|
+
|
307
|
+
events = input(conf) do |pipeline, queue|
|
308
|
+
File.open(tmpfile_path, "a") do |fd|
|
309
|
+
fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
|
310
|
+
fd.puts("first,second,third")
|
311
|
+
end
|
312
|
+
|
313
|
+
sleep 1
|
314
|
+
|
315
|
+
File.open(tmpfile2_path, "a") do |fd|
|
316
|
+
fd.puts("D_COLUMN,E_COLUMN,F_COLUMN")
|
317
|
+
fd.puts("1st,2nd,3rd")
|
318
|
+
end
|
319
|
+
|
320
|
+
4.times.collect { queue.pop }
|
321
|
+
end
|
322
|
+
|
323
|
+
insist { events[0]["_csvmetadata"] } == true
|
324
|
+
insist { events[0]["_schemacachetelemetry"] } == "newEntryCreated"
|
325
|
+
|
326
|
+
insist { events[1]["A_COLUMN"] } == "first"
|
327
|
+
insist { events[1]["B_COLUMN"] } == "second"
|
328
|
+
insist { events[1]["C_COLUMN"] } == "third"
|
329
|
+
insist { events[1]["_schemacachetelemetry"] } == "cachedEntryUsed"
|
330
|
+
|
331
|
+
insist { events[2]["_csvmetadata"] } == true
|
332
|
+
insist { events[2]["_schemacachetelemetry"] } == "newEntryCreated"
|
333
|
+
|
334
|
+
insist { events[3]["D_COLUMN"] } == "1st"
|
335
|
+
insist { events[3]["E_COLUMN"] } == "2nd"
|
336
|
+
insist { events[3]["F_COLUMN"] } == "3rd"
|
337
|
+
insist { events[3]["_schemacachetelemetry"] } == "cachedEntryUsed"
|
338
|
+
|
339
|
+
end #it
|
340
|
+
|
341
|
+
it "should resume processing of a csv file after logstash restarts" do
|
342
|
+
tmpfile_path = Stud::Temporary.pathname
|
343
|
+
sincedb_path = Stud::Temporary.pathname
|
344
|
+
|
345
|
+
# Set up to expire cache entries after 10s of being untouched. Request that telemetry be added to the event to make cache usage visible.
|
346
|
+
conf = <<-CONFIG
|
347
|
+
input {
|
348
|
+
csvfile {
|
349
|
+
path => "#{tmpfile_path}"
|
350
|
+
start_position => "beginning"
|
351
|
+
sincedb_path => "#{sincedb_path}"
|
352
|
+
delimiter => "#{delimiter}"
|
353
|
+
first_line_defines_columns => true
|
354
|
+
add_schema_cache_telemetry_to_event => true
|
355
|
+
}
|
356
|
+
}
|
357
|
+
CONFIG
|
358
|
+
|
359
|
+
|
360
|
+
File.open(tmpfile_path, "a") do |fd|
|
361
|
+
fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
|
362
|
+
fd.puts("first,second,third")
|
363
|
+
end
|
364
|
+
|
365
|
+
events = input(conf) do |pipeline, queue|
|
366
|
+
2.times.collect { queue.pop }
|
367
|
+
end
|
368
|
+
|
369
|
+
insist { events[0]["_csvmetadata"] } == true
|
370
|
+
insist { events[0]["_schemacachetelemetry"] } == "newEntryCreated"
|
371
|
+
|
372
|
+
insist { events[1]["A_COLUMN"] } == "first"
|
373
|
+
insist { events[1]["B_COLUMN"] } == "second"
|
374
|
+
insist { events[1]["C_COLUMN"] } == "third"
|
375
|
+
insist { events[1]["_schemacachetelemetry"] } == "cachedEntryUsed"
|
376
|
+
|
377
|
+
File.open(tmpfile_path, "a") do |fd|
|
378
|
+
fd.puts("fourth,fifth,sixth")
|
379
|
+
end
|
380
|
+
|
381
|
+
events = input(conf) do |pipeline, queue|
|
382
|
+
1.times.collect { queue.pop }
|
383
|
+
end
|
384
|
+
|
385
|
+
insist { events[0]["A_COLUMN"] } == "fourth"
|
386
|
+
insist { events[0]["B_COLUMN"] } == "fifth"
|
387
|
+
insist { events[0]["C_COLUMN"] } == "sixth"
|
388
|
+
insist { events[0]["_schemacachetelemetry"] } == "newEntryCreated"
|
389
|
+
|
390
|
+
end #it
|
391
|
+
|
392
|
+
it "should expire schema cache entries if untouched for more than their configured lifetime (10s in this case)" do
|
393
|
+
|
394
|
+
# This was tricky to write. Key points:
|
395
|
+
# - Utilizes a special white-box mode of the plugin that exposes what its doing with its schema cache in telemetry attributes.
|
396
|
+
# - While cache durations are typically in multiple hours, for testing we dial it back to 10s via a small fractional number.
|
397
|
+
# - All the various file IO has to go into the input block
|
398
|
+
# - The queue reads are sprinkled throughout to synchronize the test proc with logstash's file processing.
|
399
|
+
# - Put the insists right after the queue reads to better tie the inputs with the expected outputs.
|
400
|
+
|
401
|
+
puts "\nThe caching test now running will take a while... (~30s)"
|
402
|
+
|
403
|
+
tmpfile_path = Stud::Temporary.pathname
|
404
|
+
tmpfile2_path = Stud::Temporary.pathname
|
405
|
+
tmpfile3_path = Stud::Temporary.pathname
|
406
|
+
sincedb_path = Stud::Temporary.pathname
|
407
|
+
|
408
|
+
conf = <<-CONFIG
|
409
|
+
input {
|
410
|
+
csvfile {
|
411
|
+
path => "#{tmpfile_path}"
|
412
|
+
path => "#{tmpfile2_path}"
|
413
|
+
path => "#{tmpfile3_path}"
|
414
|
+
start_position => "beginning"
|
415
|
+
sincedb_path => "#{sincedb_path}"
|
416
|
+
delimiter => "#{delimiter}"
|
417
|
+
first_line_defines_columns => true
|
418
|
+
max_cached_schema_age_hours => 0.0027777777777778
|
419
|
+
add_schema_cache_telemetry_to_event => true
|
420
|
+
discover_interval => 1
|
421
|
+
}
|
422
|
+
}
|
423
|
+
CONFIG
|
424
|
+
|
425
|
+
events = input(conf) do |pipeline, queue|
|
426
|
+
|
427
|
+
# File1 Initial Entries. File 1's schema will be cached.
|
428
|
+
File.open(tmpfile_path, "a") do |fd|
|
429
|
+
fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
|
430
|
+
fd.puts("first,second,third")
|
431
|
+
end
|
432
|
+
# Verify File1 schema was cached and schema row was tagged as csvmetadata
|
433
|
+
event = queue.pop
|
434
|
+
insist { event["_schemacachetelemetry"] } == "newEntryCreated"
|
435
|
+
insist { event["_csvmetadata"] } == true
|
436
|
+
|
437
|
+
# Verify that cached File1 schema was used to decode row2 of File1
|
438
|
+
event = queue.pop
|
439
|
+
insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
|
440
|
+
insist { event["A_COLUMN"] } == "first"
|
441
|
+
insist { event["B_COLUMN"] } == "second"
|
442
|
+
insist { event["C_COLUMN"] } == "third"
|
443
|
+
|
444
|
+
# File2 Initial Entries
|
445
|
+
File.open(tmpfile2_path, "a") do |fd|
|
446
|
+
fd.puts("D_COLUMN,E_COLUMN,F_COLUMN")
|
447
|
+
fd.puts("1st,2nd,3rd")
|
448
|
+
end
|
449
|
+
# Verify File2 schema was cached and schema row was tagged as csvmetadata
|
450
|
+
event = queue.pop
|
451
|
+
insist { event["_schemacachetelemetry"] } == "newEntryCreated"
|
452
|
+
insist { event["_csvmetadata"] } == true
|
453
|
+
|
454
|
+
# Verify that cached File2 schema was used to decode row2 of File2
|
455
|
+
event = queue.pop
|
456
|
+
insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
|
457
|
+
insist { event["D_COLUMN"] } == "1st"
|
458
|
+
insist { event["E_COLUMN"] } == "2nd"
|
459
|
+
insist { event["F_COLUMN"] } == "3rd"
|
460
|
+
|
461
|
+
# Touch File1 before its cached schema entries expires (<10s), refreshing the entry.
|
462
|
+
sleep 5
|
463
|
+
File.open(tmpfile_path, "a") do |fd|
|
464
|
+
fd.puts("fourth,fifth,sixth")
|
465
|
+
end
|
466
|
+
# Verify that still-cached File1 schema was used to decode newly added row of File1
|
467
|
+
event = queue.pop
|
468
|
+
insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
|
469
|
+
insist { event["A_COLUMN"] } == "fourth"
|
470
|
+
insist { event["B_COLUMN"] } == "fifth"
|
471
|
+
insist { event["C_COLUMN"] } == "sixth"
|
472
|
+
|
473
|
+
# Touch File1 again after File2's cache entry expires.
|
474
|
+
sleep 10
|
475
|
+
File.open(tmpfile_path, "a") do |fd|
|
476
|
+
fd.puts("seventh,eighth,ninth")
|
477
|
+
end
|
478
|
+
# Verify that File1's entry hasn't expired, by virtue of the previous touch refreshing it.
|
479
|
+
event = queue.pop
|
480
|
+
insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
|
481
|
+
insist { event["A_COLUMN"] } == "seventh"
|
482
|
+
insist { event["B_COLUMN"] } == "eighth"
|
483
|
+
insist { event["C_COLUMN"] } == "ninth"
|
484
|
+
|
485
|
+
# Touch File3. Creation of its cache entry forces purge of File2's expired entry, which is made visible via telemetry.
|
486
|
+
sleep 1
|
487
|
+
File.open(tmpfile3_path, "a") do |fd|
|
488
|
+
fd.puts("X_COLUMN,Y_COLUMN,Z_COLUMN")
|
489
|
+
fd.puts("erste,zweite,dritte")
|
490
|
+
end
|
491
|
+
# Verify that scrubbing of expired cache entries takes place, reducing cached count from 2 (File1 & File2) to 1 (Just File1).
|
492
|
+
# (Scrubbing takes place before creation of File3's schema entry in the cache.)
|
493
|
+
event = queue.pop
|
494
|
+
insist { event["_csvmetadata"] } == true
|
495
|
+
insist { event["_schemacachetelemetry"] } == "newEntryCreated"
|
496
|
+
insist { event["_schemacachetelemetryscrubbedbeforecount"] } == 2
|
497
|
+
insist { event["_schemacachetelemetryscrubbedaftercount"] } == 1
|
498
|
+
|
499
|
+
# Verify that File3's schema did in fact get cached.
|
500
|
+
event = queue.pop
|
501
|
+
insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
|
502
|
+
insist { event["X_COLUMN"] } == "erste"
|
503
|
+
insist { event["Y_COLUMN"] } == "zweite"
|
504
|
+
insist { event["Z_COLUMN"] } == "dritte"
|
505
|
+
|
506
|
+
# File2 post-expiration entry. Should re-create the File2 cache entry.
|
507
|
+
sleep 1
|
508
|
+
File.open(tmpfile2_path, "a") do |fd|
|
509
|
+
fd.puts("4th,5th,6th")
|
510
|
+
end
|
511
|
+
# Verify that File2's schema gets recreated (but not transmitted as an event since this isn't the natural row0 read).
|
512
|
+
event = queue.pop
|
513
|
+
insist { event["_schemacachetelemetry"] } == "newEntryCreated"
|
514
|
+
insist { event["D_COLUMN"] } == "4th"
|
515
|
+
insist { event["E_COLUMN"] } == "5th"
|
516
|
+
insist { event["F_COLUMN"] } == "6th"
|
517
|
+
|
518
|
+
end #input block
|
519
|
+
end #it
|
520
|
+
|
521
|
+
end
|
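A note on the fractional cache lifetime used in the final spec: max_cached_schema_age_hours => 0.0027777777777778 is simply 10 seconds expressed in hours (10 / 3600 ≈ 0.0027778), which matches the 10s expiry that the test's comments and sleep calls are built around.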
metadata
ADDED
@@ -0,0 +1,124 @@
--- !ruby/object:Gem::Specification
name: logstash-input-csvfile
version: !ruby/object:Gem::Version
  version: 0.0.6
platform: ruby
authors:
- jweite
autorequire:
bindir: bin
cert_chain: []
date: 2016-02-11 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: 1.5.0
    - - <
      - !ruby/object:Gem::Version
        version: 3.0.0
  name: logstash-core
  prerelease: false
  type: :runtime
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: 1.5.0
    - - <
      - !ruby/object:Gem::Version
        version: 3.0.0
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  name: logstash-codec-plain
  prerelease: false
  type: :runtime
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: 1.0.1
  name: logstash-input-file
  prerelease: false
  type: :runtime
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: 1.0.1
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  name: stud
  prerelease: false
  type: :runtime
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  name: logstash-devutils
  prerelease: false
  type: :development
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
description: This gem is a logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/plugin install gemname. This gem is not a stand-alone program
email: jweite@yahoo.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- lib/logstash/inputs/csvfile.rb
- spec/inputs/csvfile_spec.rb
homepage: ''
licenses:
- Apache License (2.0)
metadata:
  logstash_plugin: 'true'
  logstash_group: input
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.4.5
signing_key:
specification_version: 4
summary: Extends logstash-input-file to parse csv files, optionally respecting 'first-line schemas'
test_files:
- spec/inputs/csvfile_spec.rb