logstash-input-csvfile 0.0.6

@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a3f4dde1199a122dfa627e710320adb46356b423
4
+ data.tar.gz: 91dfb639e4dbcc82d53297d6ef4b18b359042002
5
+ SHA512:
6
+ metadata.gz: 671ea579d1cc6ec358c7695f22a59f06f1a67f269389e384c9fc527aa662c4d8c0c5f373c6837f264da040cf177824c1e86e4e5f13fd5126cef0dba75370dd74
7
+ data.tar.gz: 86321481379fc771b41b7919576624446d41a6add7264e7862357fb71404cdea6c8d3b0327aeca916c6f3932a3866832ee5a136b992296de3b8862ca4c22fb11
lib/logstash/inputs/csvfile.rb ADDED
@@ -0,0 +1,234 @@
1
+ # encoding: utf-8
2
+ require "logstash/inputs/file"
3
+ require "logstash/namespace"
4
+ require "csv"
5
+
6
+ # Subclass of logstash-input-file that parses CSV lines, with support for first-line schemas.
7
+ # Set first_line_defines_columns => true to enable this behavior.
8
+ # Statically defined columns are also supported, a la logstash-filter-csv, via the columns param.
9
+ # first_line_defines_columns => true takes precedence, though.
10
+ #
11
+ # Since multiple files may be read by the same plugin instance, and each can have
12
+ # a distinct schema, this plugin records a schema per source file (as defined by the
13
+ # event's path attribute) in a hash. When it receives an event for a file it doesn't
14
+ # know, it reads/parses that file's first line to obtain the schema. This approach supports
15
+ # resuming processing after Logstash restarts mid-file.
16
+ #
17
+ # I considered extending logstash-filter-csv to do this, but felt that the only reliable
18
+ # way to support streaming CSV reads was to explicitly read the schema from the file's schema row
19
+ # (and cache it so subsequent row performance for that file is good.) Since we cannot count
20
+ # on a logstash filter having read-access to the file, or even processing events that originate
21
+ # from files, I rejected this approach. By definition, a file input plugin must have read-access
22
+ # to the file it's sourcing data from.
23
+ #
24
+ # This plugin borrows most of its csv parsing logic from logstash-filter-csv.
25
+ #
26
+ # This plugin extends logstash-input-file by overriding its decorate method. Note that
27
+ # logstash-input-plugin 0.0.10, released with Logstash 1.5, doesn't set the event's
28
+ # path element before calling decorate (which this plugin requires), so the gemspec insists
29
+ # on logstash-input-file 1.1.0
30
+ #
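+ # For illustration only, a minimal pipeline configuration using this input might
+ # look like the following (the path glob is a hypothetical example):
+ #
+ #   input {
+ #     csvfile {
+ #       path => "/var/data/*.csv"
+ #       start_position => "beginning"
+ #       first_line_defines_columns => true
+ #     }
+ #   }
+ #
+ # Each subsequent row of a file then becomes an event whose fields are named by
+ # that file's own header row; the header row's event is marked with a
+ # _csvmetadata attribute instead of column fields.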
31
+
32
+ class LogStash::Inputs::CSVFile < LogStash::Inputs::File
33
+ config_name "csvfile"
34
+
35
+ # Define a list of column names (in the order they appear in the CSV,
36
+ # as if it were a header line). If `columns` is not configured, or there
37
+ # are not enough columns specified, the default column names are
38
+ # "column1", "column2", etc. In the case that there are more columns
39
+ # in the data than specified in this column list, extra columns will be auto-numbered:
40
+ # (e.g. "user_defined_1", "user_defined_2", "column3", "column4", etc.)
41
+ config :columns, :validate => :array, :default => []
42
+
43
+ # Boolean flag that enables sourcing column names from the first event (line) of each file.
44
+ # A dynamic alternative to explicitly defining columns via the columns attribute.
45
+ config :first_line_defines_columns, :validate => :boolean, :default => false
46
+
47
+ # Define the column separator value. If this is not specified, the default
48
+ # is a comma `,`.
49
+ # Optional.
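+ # Illustrative setting: separator => ";" for semicolon-delimited files.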
50
+ config :separator, :validate => :string, :default => ","
51
+
52
+ # Define the character used to quote CSV fields. If this is not specified
53
+ # the default is a double quote `"`.
54
+ # Optional.
55
+ config :quote_char, :validate => :string, :default => '"'
56
+
57
+ # Define target field for placing the data.
58
+ # Defaults to writing to the root of the event.
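+ # Illustrative setting: target => "csv" places the parsed columns under the
+ # event's "csv" field instead of at the top level.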
59
+ config :target, :validate => :string
60
+
61
+ # The maximum time a csv file's schema can be unused (in hours) before
62
+ # it is automatically scrubbed to avoid memory leakage.
63
+ # If an event for that file arrives subsequently, the schema will be
64
+ # reconstituted (albeit with the penalty of re-reading the schema row from the file).
65
+ #
66
+ # Cache scrubbing occurs inline only when new files are detected, to minimize
67
+ # perf impact on most CSV events. Since new file detection time is the only time
68
+ # the cache actually grows, and we're expecting to pay the schema-read penalty then
69
+ # anyway, it's an optimal time to scrub.
70
+ #
71
+ # 0 disables scrubbing, but memory use will grow. That is OK if you routinely restart Logstash.
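+ #
+ # Illustrative settings:
+ #   max_cached_schema_age_hours => 0.5   # scrub schemas unused for 30+ minutes
+ #   max_cached_schema_age_hours => 0     # disable scrubbing entirely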
72
+ config :max_cached_schema_age_hours, :validate => :number, :default => 24
73
+
74
+ # Handles cases where there is other content in the file before the schema row
75
+ # that you'll want to ignore. For instance, you can skip leading blank lines
76
+ # before the schema by matching to non-blank lines using "^.+$"
77
+ # Note that the plugin will still emit events for pre-schema rows, albeit with
78
+ # no attributes (for blank lines) or default-named attributes (if the pre-schema
79
+ # lines do parse as valid CSV).
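+ # Illustrative setting: schema_pattern_to_match => "^.+$" treats the first
+ # non-blank line as the schema row.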
80
+ config :schema_pattern_to_match, :validate => :string
81
+
82
+ # For testing support: adds attributes to events describing schema cache behavior.
83
+ config :add_schema_cache_telemetry_to_event, :validate => :boolean, :default => false
84
+
85
+ public
86
+ def register
87
+ @fileColumns = Hash.new
88
+ @schemaTouchedTimes = Hash.new
89
+ super()
90
+
91
+ @logger.warn("schema cache scrubbing disabled. Memory use will grow over time.") if @max_cached_schema_age_hours <= 0
92
+ end
93
+
94
+ def decorate(event)
95
+ super(event)
96
+
97
+ message = event["message"]
98
+ return if !message
99
+
100
+ begin
101
+ values = CSV.parse_line(message, :col_sep => @separator, :quote_char => @quote_char)
102
+ return if values.nil? || values.empty?
103
+
104
+ # Get names for the columns.
105
+ if @first_line_defines_columns
106
+ @logger.debug? && @logger.debug("handling csv in first_line_defines_columns mode", :message => message, :columns => @columns)
107
+ cols = getSchemaForFile(event, values)
108
+ else
109
+ @logger.debug? && @logger.debug("handling csv in explicitly defined columns mode", :message => message, :columns => @columns)
110
+ cols = @columns
111
+ end
112
+
113
+ # Determine where to write the new attributes
114
+ if @target.nil?
115
+ # Default is to write to the root of the event.
116
+ dest = event
117
+ else
118
+ dest = event[@target] ||= {}
119
+ end
120
+
121
+ # Add the per-column attributes (as long as this isn't the event from the schema defining row)
122
+ if !event["_csvmetadata"]
123
+ values.each_index do |i|
124
+ field_name = cols[i] || "column#{i+1}"
125
+ dest[field_name] = values[i]
126
+ end
127
+ end
128
+
129
+ rescue => e
130
+ event.tag "_csvparsefailure"
131
+ @logger.warn("Trouble parsing csv", :message => message, :exception => e)
132
+ return
133
+ end # begin
134
+ end # decorate()
135
+
136
+ def getSchemaForFile(event, parsedValues)
137
+ path = event["path"]
138
+ if !path
139
+ @logger.warn("No path in event. Cannot retrieve a schema for this event.")
140
+ return []
141
+ end
142
+
143
+ @logger.debug? && @logger.debug("Getting schema for file", :path => path)
144
+
145
+ schema = getCachedSchemaForFile(path)
146
+ if schema
147
+ @logger.debug? && @logger.debug("Using cached schema", :cols => schema)
148
+ event["_schemacachetelemetry"]="cachedEntryUsed" if @add_schema_cache_telemetry_to_event
149
+ touchSchema(path)
150
+ return schema
151
+ end
152
+
153
+ @logger.debug? && @logger.debug("Event from unknown file/schema. Reading schema from that file.", :path => path)
154
+
155
+ scrubSchemaCache(event) if @max_cached_schema_age_hours > 0
156
+
157
+ csvFileLine = readSchemaLineFromFile(path)
158
+ if !csvFileLine || csvFileLine.length == 0
159
+ @logger.warn("No suitable schema row found in file.", :path => path)
160
+ return []
161
+ end
162
+
163
+ schema = CSV.parse_line(csvFileLine, :col_sep => @separator, :quote_char => @quote_char)
164
+ addSchemaToCache(path, schema)
165
+ @logger.debug? && @logger.debug("Schema read from file:", :path => path, :cols => schema)
166
+
167
+ if @add_schema_cache_telemetry_to_event
168
+ event["_schemacachetelemetry"]="newEntryCreated"
169
+ event["_cache_touch_time"]=Time.now
170
+ end
171
+
172
+ # Special handling for the schema row event: tag _csvmetadata and don't return individual column attributes
173
+ if @fileColumns[path].join == parsedValues.join
174
+ @logger.debug? && @logger.debug("Received the schema row event. Tagging w/ _csvmetadata", :message => message)
175
+ event["_csvmetadata"] = true
176
+ return []
177
+ else
178
+ return schema
179
+ end
180
+
181
+ end
182
+
183
+ def getCachedSchemaForFile(path)
184
+ @fileColumns[path]
185
+ end
186
+
187
+ def addSchemaToCache(path, schema)
188
+ @fileColumns[path] = schema
189
+ touchSchema(path)
190
+ end
191
+
192
+ def touchSchema(path)
193
+ @schemaTouchedTimes[path] = Time.now
194
+ end
195
+
196
+ def readSchemaLineFromFile(path)
197
+ csvFileLine = ""
198
+ File.open(path, "r") do |f|
199
+ while csvFileLine.length == 0 and csvFileLine = f.gets
200
+ if @schema_pattern_to_match
201
+ if !csvFileLine.end_with?("\n") or !csvFileLine.match(@schema_pattern_to_match)
202
+ csvFileLine = ""
203
+ end
204
+ end
205
+ end
206
+ end
207
+ csvFileLine
208
+ end
209
+
210
+ def scrubSchemaCache(event)
211
+ @logger.debug? && @logger.debug("Scrubbing schema cache", :size => @fileColumns.length)
212
+ event["_schemacachetelemetryscrubbedbeforecount"]=@fileColumns.length if @add_schema_cache_telemetry_to_event
213
+
214
+ expiringFiles = []
215
+ now = Time.now
216
+ @schemaTouchedTimes.each do |filename, lastReadTime|
217
+ if (lastReadTime + (@max_cached_schema_age_hours * 60 * 60)) < now
218
+ expiringFiles << filename
219
+ @logger.debug? && @logger.debug("Expiring schema for: ", :file => filename, :lastRead => lastReadTime)
220
+ end
221
+ end
222
+
223
+ expiringFiles.each do |filename|
224
+ @fileColumns.delete(filename)
225
+ @schemaTouchedTimes.delete(filename)
226
+ @logger.debug? && @logger.debug("Deleted schema for: ", :file => filename)
227
+ end
228
+
229
+ event["_schemacachetelemetryscrubbedaftercount"]=@fileColumns.length if @add_schema_cache_telemetry_to_event
230
+ @logger.debug? && @logger.debug("Done scrubbing schema cache", :size => @fileColumns.length)
231
+
232
+ end
233
+
234
+ end # class LogStash::Inputs::CSVFile
spec/inputs/csvfile_spec.rb ADDED
@@ -0,0 +1,521 @@
1
+ # encoding: utf-8
2
+
3
+ require "logstash/devutils/rspec/spec_helper"
4
+ require "tempfile"
5
+ require "stud/temporary"
6
+ require "logstash/inputs/csvfile"
7
+
8
+ describe "inputs/csvfile" do
9
+
10
+ delimiter = (LogStash::Environment.windows? ? "\r\n" : "\n")
11
+
12
+ #Borrowed this first check from file_spec.rb verbatim to get the pipeline running...
13
+ it "should starts at the end of an existing file" do
14
+ tmpfile_path = Stud::Temporary.pathname
15
+ sincedb_path = Stud::Temporary.pathname
16
+
17
+ conf = <<-CONFIG
18
+ input {
19
+ file {
20
+ type => "blah"
21
+ path => "#{tmpfile_path}"
22
+ sincedb_path => "#{sincedb_path}"
23
+ delimiter => "#{delimiter}"
24
+ }
25
+ }
26
+ CONFIG
27
+
28
+ File.open(tmpfile_path, "w") do |fd|
29
+ fd.puts("ignore me 1")
30
+ fd.puts("ignore me 2")
31
+ end
32
+
33
+ events = input(conf) do |pipeline, queue|
34
+
35
+ # at this point the plugin's
36
+ # threads might still be initializing so we cannot know when the
37
+ # file plugin will have seen the original file, it could see it
38
+ # after the first(s) hello world appends below, hence the
39
+ # retry logic.
40
+
41
+ events = []
42
+
43
+ retries = 0
44
+ while retries < 20
45
+ File.open(tmpfile_path, "a") do |fd|
46
+ fd.puts("hello")
47
+ fd.puts("world")
48
+ end
49
+
50
+ if queue.size >= 2
51
+ events = 2.times.collect { queue.pop }
52
+ break
53
+ end
54
+
55
+ sleep(0.1)
56
+ retries += 1
57
+ end
58
+
59
+ events
60
+ end #input block
61
+
62
+ insist { events[0]["message"] } == "hello"
63
+ insist { events[1]["message"] } == "world"
64
+ end #it
65
+
66
+ it "should parse csv columns into event attributes using default column names" do
67
+ tmpfile_path = Stud::Temporary.pathname
68
+ sincedb_path = Stud::Temporary.pathname
69
+
70
+ conf = <<-CONFIG
71
+ input {
72
+ csvfile {
73
+ path => "#{tmpfile_path}"
74
+ start_position => "beginning"
75
+ sincedb_path => "#{sincedb_path}"
76
+ delimiter => "#{delimiter}"
77
+ }
78
+ }
79
+ CONFIG
80
+
81
+ File.open(tmpfile_path, "a") do |fd|
82
+ fd.puts("first,second,third")
83
+ fd.puts('"fou,rth","fifth"') #Quoting check
84
+ fd.puts("sixth,seventh,eighth,ninth")
85
+ end
86
+
87
+ events = input(conf) do |pipeline, queue|
88
+ 3.times.collect { queue.pop }
89
+ end
90
+
91
+ insist { events[0]["column1"] } == "first"
92
+ insist { events[0]["column2"] } == "second"
93
+ insist { events[0]["column3"] } == "third"
94
+ insist { events[1]["column1"] } == "fou,rth" #Not a typo: quoting check
95
+ insist { events[1]["column2"] } == "fifth"
96
+ insist { events[2]["column1"] } == "sixth"
97
+ insist { events[2]["column2"] } == "seventh"
98
+ insist { events[2]["column3"] } == "eighth"
99
+ insist { events[2]["column4"] } == "ninth"
100
+
101
+ end #it
102
+
103
+ it "should parse csv columns into attributes using explicitly defined column names, default-naming any excess columns; non-default csv separator" do
104
+ tmpfile_path = Stud::Temporary.pathname
105
+ sincedb_path = Stud::Temporary.pathname
106
+
107
+ conf = <<-CONFIG
108
+ input {
109
+ csvfile {
110
+ path => "#{tmpfile_path}"
111
+ start_position => "beginning"
112
+ sincedb_path => "#{sincedb_path}"
113
+ delimiter => "#{delimiter}"
114
+ separator => ";"
115
+ columns => ["FIRST_COL","SECOND_COL","THIRD_COL"]
116
+ }
117
+ }
118
+ CONFIG
119
+
120
+ File.open(tmpfile_path, "a") do |fd|
121
+ fd.puts("first;second;third")
122
+ fd.puts("fourth;fifth")
123
+ fd.puts("sixth;sev,enth;eighth;ninth")
124
+ end
125
+
126
+ events = input(conf) do |pipeline, queue|
127
+ 3.times.collect { queue.pop }
128
+ end
129
+
130
+ insist { events[0]["FIRST_COL"] } == "first"
131
+ insist { events[0]["SECOND_COL"] } == "second"
132
+ insist { events[0]["THIRD_COL"] } == "third"
133
+ insist { events[1]["FIRST_COL"] } == "fourth"
134
+ insist { events[1]["SECOND_COL"] } == "fifth"
135
+ insist { events[2]["FIRST_COL"] } == "sixth"
136
+ insist { events[2]["SECOND_COL"] } == "sev,enth"
137
+ insist { events[2]["THIRD_COL"] } == "eighth"
138
+ insist { events[2]["column4"] } == "ninth"
139
+
140
+ end #it
141
+
142
+ it "should parse csv columns into attributes using column names defined on the csv files 0th row with each csv file defining its own independent schema; it should tag schema row events as _csvmetadata" do
143
+ tmpfile_path = Stud::Temporary.pathname
144
+ tmpfile2_path = Stud::Temporary.pathname
145
+ sincedb_path = Stud::Temporary.pathname
146
+
147
+ conf = <<-CONFIG
148
+ input {
149
+ csvfile {
150
+ path => "#{tmpfile_path}"
151
+ path => "#{tmpfile2_path}"
152
+ start_position => "beginning"
153
+ sincedb_path => "#{sincedb_path}"
154
+ delimiter => "#{delimiter}"
155
+ first_line_defines_columns => true
156
+ }
157
+ }
158
+ CONFIG
159
+
160
+ File.open(tmpfile_path, "a") do |fd|
161
+ fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
162
+ fd.puts("first,second,third")
163
+ fd.puts("fourth,fifth")
164
+ fd.puts("sixth,seventh,eighth,ninth")
165
+ end
166
+
167
+ events = input(conf) do |pipeline, queue|
168
+ 4.times.collect { queue.pop }
169
+ end
170
+
171
+ insist { events[0]["_csvmetadata"] } == true
172
+ insist { events[1]["A_COLUMN"] } == "first"
173
+ insist { events[1]["B_COLUMN"] } == "second"
174
+ insist { events[1]["C_COLUMN"] } == "third"
175
+ insist { events[2]["A_COLUMN"] } == "fourth"
176
+ insist { events[2]["B_COLUMN"] } == "fifth"
177
+ insist { events[3]["A_COLUMN"] } == "sixth"
178
+ insist { events[3]["B_COLUMN"] } == "seventh"
179
+ insist { events[3]["C_COLUMN"] } == "eighth"
180
+ insist { events[3]["column4"] } == "ninth"
181
+
182
+ File.open(tmpfile2_path, "a") do |fd|
183
+ fd.puts("D_COLUMN,E_COLUMN,F_COLUMN")
184
+ fd.puts("first,second,third")
185
+ fd.puts("fourth,fifth")
186
+ fd.puts("sixth,seventh,eighth,ninth")
187
+ end
188
+
189
+ events = input(conf) do |pipeline, queue|
190
+ 4.times.collect { queue.pop }
191
+ end
192
+
193
+ insist { events[0]["_csvmetadata"] } == true
194
+ insist { events[1]["D_COLUMN"] } == "first"
195
+ insist { events[1]["E_COLUMN"] } == "second"
196
+ insist { events[1]["F_COLUMN"] } == "third"
197
+ insist { events[2]["D_COLUMN"] } == "fourth"
198
+ insist { events[2]["E_COLUMN"] } == "fifth"
199
+ insist { events[3]["D_COLUMN"] } == "sixth"
200
+ insist { events[3]["E_COLUMN"] } == "seventh"
201
+ insist { events[3]["F_COLUMN"] } == "eighth"
202
+ insist { events[3]["column4"] } == "ninth"
203
+
204
+ end #it
205
+
206
+ it "should parse csv columns into attributes using column names defined on the first file row that matches the schema_pattern_to_match with each csv file defining its own independent schema; it should tag schema row events as _csvmetadata" do
207
+ tmpfile_path = Stud::Temporary.pathname
208
+ sincedb_path = Stud::Temporary.pathname
209
+
210
+ conf = <<-CONFIG
211
+ input {
212
+ csvfile {
213
+ path => "#{tmpfile_path}"
214
+ start_position => "beginning"
215
+ sincedb_path => "#{sincedb_path}"
216
+ delimiter => "#{delimiter}"
217
+ first_line_defines_columns => true
218
+ schema_pattern_to_match => "^.+$"
219
+ }
220
+ }
221
+ CONFIG
222
+
223
+ File.open(tmpfile_path, "a") do |fd|
224
+ fd.puts("")
225
+ fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
226
+ fd.puts("first,second,third")
227
+ fd.puts("fourth,fifth")
228
+ fd.puts("sixth,seventh,eighth,ninth")
229
+ end
230
+
231
+ events = input(conf) do |pipeline, queue|
232
+ 5.times.collect { queue.pop }
233
+ end
234
+
235
+ insist { events[1]["_csvmetadata"] } == true
236
+ insist { events[2]["A_COLUMN"] } == "first"
237
+ insist { events[2]["B_COLUMN"] } == "second"
238
+ insist { events[2]["C_COLUMN"] } == "third"
239
+ insist { events[3]["A_COLUMN"] } == "fourth"
240
+ insist { events[3]["B_COLUMN"] } == "fifth"
241
+ insist { events[4]["A_COLUMN"] } == "sixth"
242
+ insist { events[4]["B_COLUMN"] } == "seventh"
243
+ insist { events[4]["C_COLUMN"] } == "eighth"
244
+ insist { events[4]["column4"] } == "ninth"
245
+
246
+ end #it
247
+
248
+ it "should parse csv columns into attributes using explicitly defined column names, default-naming any excess columns; non-default csv separator" do
249
+ tmpfile_path = Stud::Temporary.pathname
250
+ sincedb_path = Stud::Temporary.pathname
251
+
252
+ conf = <<-CONFIG
253
+ input {
254
+ csvfile {
255
+ path => "#{tmpfile_path}"
256
+ start_position => "beginning"
257
+ sincedb_path => "#{sincedb_path}"
258
+ delimiter => "#{delimiter}"
259
+ separator => ";"
260
+ columns => ["FIRST_COL","SECOND_COL","THIRD_COL"]
261
+ }
262
+ }
263
+ CONFIG
264
+
265
+ File.open(tmpfile_path, "a") do |fd|
266
+ fd.puts("first;second;third")
267
+ fd.puts("fourth;fifth")
268
+ fd.puts("sixth;sev,enth;eighth;ninth")
269
+ end
270
+
271
+ events = input(conf) do |pipeline, queue|
272
+ 3.times.collect { queue.pop }
273
+ end
274
+
275
+ insist { events[0]["FIRST_COL"] } == "first"
276
+ insist { events[0]["SECOND_COL"] } == "second"
277
+ insist { events[0]["THIRD_COL"] } == "third"
278
+ insist { events[1]["FIRST_COL"] } == "fourth"
279
+ insist { events[1]["SECOND_COL"] } == "fifth"
280
+ insist { events[2]["FIRST_COL"] } == "sixth"
281
+ insist { events[2]["SECOND_COL"] } == "sev,enth"
282
+ insist { events[2]["THIRD_COL"] } == "eighth"
283
+ insist { events[2]["column4"] } == "ninth"
284
+
285
+ end #it
286
+
287
+ it "should cache schemas per file" do
288
+ tmpfile_path = Stud::Temporary.pathname
289
+ tmpfile2_path = Stud::Temporary.pathname
290
+ sincedb_path = Stud::Temporary.pathname
291
+
292
+ conf = <<-CONFIG
293
+ input {
294
+ csvfile {
295
+ path => "#{tmpfile_path}"
296
+ path => "#{tmpfile2_path}"
297
+ start_position => "beginning"
298
+ sincedb_path => "#{sincedb_path}"
299
+ delimiter => "#{delimiter}"
300
+ first_line_defines_columns => true
301
+ add_schema_cache_telemetry_to_event => true
302
+ }
303
+ }
304
+ CONFIG
305
+
306
+
307
+ events = input(conf) do |pipeline, queue|
308
+ File.open(tmpfile_path, "a") do |fd|
309
+ fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
310
+ fd.puts("first,second,third")
311
+ end
312
+
313
+ sleep 1
314
+
315
+ File.open(tmpfile2_path, "a") do |fd|
316
+ fd.puts("D_COLUMN,E_COLUMN,F_COLUMN")
317
+ fd.puts("1st,2nd,3rd")
318
+ end
319
+
320
+ 4.times.collect { queue.pop }
321
+ end
322
+
323
+ insist { events[0]["_csvmetadata"] } == true
324
+ insist { events[0]["_schemacachetelemetry"] } == "newEntryCreated"
325
+
326
+ insist { events[1]["A_COLUMN"] } == "first"
327
+ insist { events[1]["B_COLUMN"] } == "second"
328
+ insist { events[1]["C_COLUMN"] } == "third"
329
+ insist { events[1]["_schemacachetelemetry"] } == "cachedEntryUsed"
330
+
331
+ insist { events[2]["_csvmetadata"] } == true
332
+ insist { events[2]["_schemacachetelemetry"] } == "newEntryCreated"
333
+
334
+ insist { events[3]["D_COLUMN"] } == "1st"
335
+ insist { events[3]["E_COLUMN"] } == "2nd"
336
+ insist { events[3]["F_COLUMN"] } == "3rd"
337
+ insist { events[3]["_schemacachetelemetry"] } == "cachedEntryUsed"
338
+
339
+ end #it
340
+
341
+ it "should resume processing of a csv file after logstash restarts" do
342
+ tmpfile_path = Stud::Temporary.pathname
343
+ sincedb_path = Stud::Temporary.pathname
344
+
345
+ # Request that telemetry be added to the events to make schema cache usage visible.
346
+ conf = <<-CONFIG
347
+ input {
348
+ csvfile {
349
+ path => "#{tmpfile_path}"
350
+ start_position => "beginning"
351
+ sincedb_path => "#{sincedb_path}"
352
+ delimiter => "#{delimiter}"
353
+ first_line_defines_columns => true
354
+ add_schema_cache_telemetry_to_event => true
355
+ }
356
+ }
357
+ CONFIG
358
+
359
+
360
+ File.open(tmpfile_path, "a") do |fd|
361
+ fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
362
+ fd.puts("first,second,third")
363
+ end
364
+
365
+ events = input(conf) do |pipeline, queue|
366
+ 2.times.collect { queue.pop }
367
+ end
368
+
369
+ insist { events[0]["_csvmetadata"] } == true
370
+ insist { events[0]["_schemacachetelemetry"] } == "newEntryCreated"
371
+
372
+ insist { events[1]["A_COLUMN"] } == "first"
373
+ insist { events[1]["B_COLUMN"] } == "second"
374
+ insist { events[1]["C_COLUMN"] } == "third"
375
+ insist { events[1]["_schemacachetelemetry"] } == "cachedEntryUsed"
376
+
377
+ File.open(tmpfile_path, "a") do |fd|
378
+ fd.puts("fourth,fifth,sixth")
379
+ end
380
+
381
+ events = input(conf) do |pipeline, queue|
382
+ 1.times.collect { queue.pop }
383
+ end
384
+
385
+ insist { events[0]["A_COLUMN"] } == "fourth"
386
+ insist { events[0]["B_COLUMN"] } == "fifth"
387
+ insist { events[0]["C_COLUMN"] } == "sixth"
388
+ insist { events[0]["_schemacachetelemetry"] } == "newEntryCreated"
389
+
390
+ end #it
391
+
392
+ it "should expire schema cache entries if untouched for more than their configured lifetime (10s in this case)" do
393
+
394
+ # This was tricky to write. Key points:
395
+ # - Utilizes a special white-box mode of the plugin that exposes what it's doing with its schema cache in telemetry attributes.
396
+ # - While cache durations are typically multiple hours, for testing we dial them back to 10s via a small fractional value.
397
+ # - All the various file IO has to go into the input block
398
+ # - The queue reads are sprinkled throughout to synchronize the test proc with logstash's file processing.
399
+ # - Put the insists right after the queue reads to better tie the inputs with the expected outputs.
400
+
401
+ puts "\nThe caching test now running will take a while... (~30s)"
402
+
403
+ tmpfile_path = Stud::Temporary.pathname
404
+ tmpfile2_path = Stud::Temporary.pathname
405
+ tmpfile3_path = Stud::Temporary.pathname
406
+ sincedb_path = Stud::Temporary.pathname
407
+
408
+ conf = <<-CONFIG
409
+ input {
410
+ csvfile {
411
+ path => "#{tmpfile_path}"
412
+ path => "#{tmpfile2_path}"
413
+ path => "#{tmpfile3_path}"
414
+ start_position => "beginning"
415
+ sincedb_path => "#{sincedb_path}"
416
+ delimiter => "#{delimiter}"
417
+ first_line_defines_columns => true
418
+ max_cached_schema_age_hours => 0.0027777777777778
419
+ add_schema_cache_telemetry_to_event => true
420
+ discover_interval => 1
421
+ }
422
+ }
423
+ CONFIG
424
+
425
+ events = input(conf) do |pipeline, queue|
426
+
427
+ # File1 Initial Entries. File 1's schema will be cached.
428
+ File.open(tmpfile_path, "a") do |fd|
429
+ fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
430
+ fd.puts("first,second,third")
431
+ end
432
+ # Verify File1 schema was cached and schema row was tagged as csvmetadata
433
+ event = queue.pop
434
+ insist { event["_schemacachetelemetry"] } == "newEntryCreated"
435
+ insist { event["_csvmetadata"] } == true
436
+
437
+ # Verify that cached File1 schema was used to decode row2 of File1
438
+ event = queue.pop
439
+ insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
440
+ insist { event["A_COLUMN"] } == "first"
441
+ insist { event["B_COLUMN"] } == "second"
442
+ insist { event["C_COLUMN"] } == "third"
443
+
444
+ # File2 Initial Entries
445
+ File.open(tmpfile2_path, "a") do |fd|
446
+ fd.puts("D_COLUMN,E_COLUMN,F_COLUMN")
447
+ fd.puts("1st,2nd,3rd")
448
+ end
449
+ # Verify File2 schema was cached and schema row was tagged as csvmetadata
450
+ event = queue.pop
451
+ insist { event["_schemacachetelemetry"] } == "newEntryCreated"
452
+ insist { event["_csvmetadata"] } == true
453
+
454
+ # Verify that cached File2 schema was used to decode row2 of File2
455
+ event = queue.pop
456
+ insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
457
+ insist { event["D_COLUMN"] } == "1st"
458
+ insist { event["E_COLUMN"] } == "2nd"
459
+ insist { event["F_COLUMN"] } == "3rd"
460
+
461
+ # Touch File1 before its cached schema entry expires (<10s), refreshing the entry.
462
+ sleep 5
463
+ File.open(tmpfile_path, "a") do |fd|
464
+ fd.puts("fourth,fifth,sixth")
465
+ end
466
+ # Verify that still-cached File1 schema was used to decode newly added row of File1
467
+ event = queue.pop
468
+ insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
469
+ insist { event["A_COLUMN"] } == "fourth"
470
+ insist { event["B_COLUMN"] } == "fifth"
471
+ insist { event["C_COLUMN"] } == "sixth"
472
+
473
+ # Touch File1 again after File2's cache entry expires.
474
+ sleep 10
475
+ File.open(tmpfile_path, "a") do |fd|
476
+ fd.puts("seventh,eighth,ninth")
477
+ end
478
+ # Verify that File1's entry hasn't expired, by virtue of the previous touch refreshing it.
479
+ event = queue.pop
480
+ insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
481
+ insist { event["A_COLUMN"] } == "seventh"
482
+ insist { event["B_COLUMN"] } == "eighth"
483
+ insist { event["C_COLUMN"] } == "ninth"
484
+
485
+ # Touch File3. Creation of its cache entry forces purge of File2's expired entry, which is made visible via telemetry.
486
+ sleep 1
487
+ File.open(tmpfile3_path, "a") do |fd|
488
+ fd.puts("X_COLUMN,Y_COLUMN,Z_COLUMN")
489
+ fd.puts("erste,zweite,dritte")
490
+ end
491
+ # Verify that scrubbing of expired cache entries takes place, reducing cached count from 2 (File1 & File2) to 1 (Just File1).
492
+ # (Scrubbing takes place before creation of File3's schema entry in the cache.)
493
+ event = queue.pop
494
+ insist { event["_csvmetadata"] } == true
495
+ insist { event["_schemacachetelemetry"] } == "newEntryCreated"
496
+ insist { event["_schemacachetelemetryscrubbedbeforecount"] } == 2
497
+ insist { event["_schemacachetelemetryscrubbedaftercount"] } == 1
498
+
499
+ # Verify that File3's schema did in fact get cached.
500
+ event = queue.pop
501
+ insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
502
+ insist { event["X_COLUMN"] } == "erste"
503
+ insist { event["Y_COLUMN"] } == "zweite"
504
+ insist { event["Z_COLUMN"] } == "dritte"
505
+
506
+ # File2 post-expiration entry. Should re-create the File2 cache entry.
507
+ sleep 1
508
+ File.open(tmpfile2_path, "a") do |fd|
509
+ fd.puts("4th,5th,6th")
510
+ end
511
+ # Verify that File2's schema gets recreated (the schema row itself is not re-emitted as an event since this isn't the natural row-0 read).
512
+ event = queue.pop
513
+ insist { event["_schemacachetelemetry"] } == "newEntryCreated"
514
+ insist { event["D_COLUMN"] } == "4th"
515
+ insist { event["E_COLUMN"] } == "5th"
516
+ insist { event["F_COLUMN"] } == "6th"
517
+
518
+ end #input block
519
+ end #it
520
+
521
+ end
metadata ADDED
@@ -0,0 +1,124 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: logstash-input-csvfile
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.6
5
+ platform: ruby
6
+ authors:
7
+ - jweite
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-02-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - '>='
17
+ - !ruby/object:Gem::Version
18
+ version: 1.5.0
19
+ - - <
20
+ - !ruby/object:Gem::Version
21
+ version: 3.0.0
22
+ name: logstash-core
23
+ prerelease: false
24
+ type: :runtime
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 1.5.0
30
+ - - <
31
+ - !ruby/object:Gem::Version
32
+ version: 3.0.0
33
+ - !ruby/object:Gem::Dependency
34
+ requirement: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ name: logstash-codec-plain
40
+ prerelease: false
41
+ type: :runtime
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ - !ruby/object:Gem::Dependency
48
+ requirement: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - '>='
51
+ - !ruby/object:Gem::Version
52
+ version: 1.0.1
53
+ name: logstash-input-file
54
+ prerelease: false
55
+ type: :runtime
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - '>='
59
+ - !ruby/object:Gem::Version
60
+ version: 1.0.1
61
+ - !ruby/object:Gem::Dependency
62
+ requirement: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ name: stud
68
+ prerelease: false
69
+ type: :runtime
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - '>='
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ - !ruby/object:Gem::Dependency
76
+ requirement: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ name: logstash-devutils
82
+ prerelease: false
83
+ type: :development
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - '>='
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ description: This gem is a logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/plugin install gemname. This gem is not a stand-alone program
90
+ email: jweite@yahoo.com
91
+ executables: []
92
+ extensions: []
93
+ extra_rdoc_files: []
94
+ files:
95
+ - lib/logstash/inputs/csvfile.rb
96
+ - spec/inputs/csvfile_spec.rb
97
+ homepage: ''
98
+ licenses:
99
+ - Apache License (2.0)
100
+ metadata:
101
+ logstash_plugin: 'true'
102
+ logstash_group: input
103
+ post_install_message:
104
+ rdoc_options: []
105
+ require_paths:
106
+ - lib
107
+ required_ruby_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - '>='
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ required_rubygems_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubyforge_project:
119
+ rubygems_version: 2.4.5
120
+ signing_key:
121
+ specification_version: 4
122
+ summary: Extends logstash-input-file to parse csv files, optionally respecting 'first-line schemas'
123
+ test_files:
124
+ - spec/inputs/csvfile_spec.rb