logstash-input-csvfile 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a3f4dde1199a122dfa627e710320adb46356b423
4
+ data.tar.gz: 91dfb639e4dbcc82d53297d6ef4b18b359042002
5
+ SHA512:
6
+ metadata.gz: 671ea579d1cc6ec358c7695f22a59f06f1a67f269389e384c9fc527aa662c4d8c0c5f373c6837f264da040cf177824c1e86e4e5f13fd5126cef0dba75370dd74
7
+ data.tar.gz: 86321481379fc771b41b7919576624446d41a6add7264e7862357fb71404cdea6c8d3b0327aeca916c6f3932a3866832ee5a136b992296de3b8862ca4c22fb11
lib/logstash/inputs/csvfile.rb ADDED
@@ -0,0 +1,234 @@
1
+ # encoding: utf-8
2
+ require "logstash/inputs/file"
3
+ require "logstash/namespace"
4
+ require "csv"
5
+
6
+ # Subclass of logstash-input-file that parses CSV lines, with support for first-line schemas.
7
+ # Set first_line_defines_columns => true to enable this behavior.
8
+ # Statically defined columns are also supported, a la logstash-filter-csv, via the columns param.
9
+ # first_line_defines_columns => true takes precedence, though.
10
+ #
11
+ # Since multiple files may be read by the same plugin instance, and each can have
12
+ # a distinct schema, this plugin records a schema per source file (as defined by the
13
+ # event's path attribute) in a hash. When it receives an event for a file it doesn't
14
+ # know, it reads/parses that file's first line to obtain the schema. This approach supports
15
+ # resuming processing after logstash restarts in mid-file.
16
+ #
17
+ # I considered extending logstash-filter-csv to do this, but felt that the only reliable
18
+ # way to support streaming csv reads was to explicitly read the schema from the file's schema row
19
+ # (and cache it so subsequent row performance for that file is good). Since we cannot count
20
+ # on a logstash filter having read-access to the file, or even processing events that originate
21
+ # from files, I rejected this approach. By definition, a file input plugin must have read-access
22
+ # to the file it's sourcing data from.
23
+ #
24
+ # This plugin borrows most of its csv parsing logic from logstash-filter-csv.
25
+ #
26
+ # This plugin extends logstash-input-file by overriding its decorate method. Note that
27
+ # logstash-input-plugin 0.0.10, released with Logstash 1.5, doesn't set the event's
28
+ # path element before calling decorate (which this plugin requires), so the gemspec insists
29
+ # on logstash-input-file 1.1.0.
30
+ #
31
+
32
+ class LogStash::Inputs::CSVFile < LogStash::Inputs::File
33
+ config_name "csvfile"
34
+
35
+ # Define a list of column names (in the order they appear in the CSV,
36
+ # as if it were a header line). If `columns` is not configured, or there
37
+ # are not enough columns specified, the default column names are
38
+ # "column1", "column2", etc. In the case that there are more columns
39
+ # in the data than specified in this column list, the extra columns will be auto-numbered
40
+ # (e.g. with columns => ["user_defined_1", "user_defined_2"], a four-column row yields "user_defined_1", "user_defined_2", "column3", "column4").
41
+ config :columns, :validate => :array, :default => []
42
+
43
+ # Boolean flag that enables sourcing column names from the first event (line) of each file.
44
+ # A dynamic alternative to explicitly defining columns in the columns attribute.
45
+ config :first_line_defines_columns, :validate => :boolean, :default => false
46
+
47
+ # Define the column separator value. If this is not specified, the default
48
+ # is a comma `,`.
49
+ # Optional.
50
+ config :separator, :validate => :string, :default => ","
51
+
52
+ # Define the character used to quote CSV fields. If this is not specified
53
+ # the default is a double quote `"`.
54
+ # Optional.
55
+ config :quote_char, :validate => :string, :default => '"'
56
+
57
+ # Define target field for placing the data.
58
+ # Defaults to writing to the root of the event.
59
+ config :target, :validate => :string
60
+
61
+ # The maximum time a csv file's schema can be unused (in hours) before
62
+ # it is automatically scrubbed to avoid memory leakage.
63
+ # If an event for that file arrives subsequently, the schema will be
64
+ # reconstituted (albeit with the penalty of re-reading the schema row from the file).
65
+ #
66
+ # Cache scrubbing occurs inline, and only when new files are detected, to minimize
67
+ # perf impact on most CSV events. Since new file detection time is the only time
68
+ # the cache actually grows, and we're expecting to pay the schema-read penalty then
69
+ # anyway, it's an optimal time to scrub.
70
+ #
71
+ # 0 disables scrubbing, but memory use will grow. That's OK if you routinely restart logstash.
72
+ config :max_cached_schema_age_hours, :validate => :number, :default => 24
73
+
74
+ # Handles cases where there's other content in the file before the schema row
75
+ # that you'll want to ignore. For instance, you can skip leading blank lines
76
+ # before the schema by matching only non-blank lines with "^.+$".
77
+ # Note that the plugin will still emit events for pre-schema rows, albeit with
78
+ # no attributes (for blank lines) or default-named attributes (if the pre-schema
79
+ # lines do parse as valid CSV).
80
+ config :schema_pattern_to_match, :validate => :string
81
+
82
+ # Supports testing: adds attributes to events describing schema cache behavior.
83
+ config :add_schema_cache_telemetry_to_event, :validate => :boolean, :default => false
84
+
85
+ public
86
+ def register
87
+ @fileColumns = Hash.new
88
+ @schemaTouchedTimes = Hash.new
89
+ super()
90
+
91
+ @logger.warn("schema cache scrubbing disabled. Memory use will grow over time.") if @max_cached_schema_age_hours <= 0
92
+ end
93
+
94
+ def decorate(event)
95
+ super(event)
96
+
97
+ message = event["message"]
98
+ return if !message
99
+
100
+ begin
101
+ values = CSV.parse_line(message, :col_sep => @separator, :quote_char => @quote_char)
102
+ return if values.nil? || values.length == 0
103
+
104
+ # Get names for the columns.
105
+ if @first_line_defines_columns
106
+ @logger.debug? && @logger.debug("handling csv in first_line_defines_columns mode", :message => message, :columns => @columns)
107
+ cols = getSchemaForFile(event, values)
108
+ else
109
+ @logger.debug? && @logger.debug("handling csv in explicitly defined columns mode", :message => message, :columns => @columns)
110
+ cols = @columns
111
+ end
112
+
113
+ # Determine where to write the new attributes
114
+ if @target.nil?
115
+ # Default is to write to the root of the event.
116
+ dest = event
117
+ else
118
+ dest = event[@target] ||= {}
119
+ end
120
+
121
+ # Add the per-column attributes (as long as this isn't the event from the schema defining row)
122
+ if !event["_csvmetadata"]
123
+ values.each_index do |i|
124
+ field_name = cols[i] || "column#{i+1}"
125
+ dest[field_name] = values[i]
126
+ end
127
+ end
128
+
129
+ rescue => e
130
+ event.tag "_csvparsefailure"
131
+ @logger.warn("Trouble parsing csv", :message => message, :exception => e)
132
+ return
133
+ end # begin
134
+ end # decorate()
135
+
136
+ def getSchemaForFile(event, parsedValues)
137
+ path = event["path"]
138
+ if !path
139
+ @logger.warn("No path in event. Cannot retrieve a schema for this event.")
140
+ return []
141
+ end
142
+
143
+ @logger.debug? && @logger.debug("Getting schema for file", :path => path)
144
+
145
+ schema = getCachedSchemaForFile(path)
146
+ if schema
147
+ @logger.debug? && @logger.debug("Using cached schema", :cols => schema)
148
+ event["_schemacachetelemetry"]="cachedEntryUsed" if @add_schema_cache_telemetry_to_event
149
+ touchSchema(path)
150
+ return schema
151
+ end
152
+
153
+ @logger.debug? && @logger.debug("Event from unknown file/schema. Reading schema from that file.", :path => path)
154
+
155
+ scrubSchemaCache(event) if @max_cached_schema_age_hours > 0
156
+
157
+ csvFileLine = readSchemaLineFromFile(path)
158
+ if !csvFileLine || csvFileLine.length == 0
159
+ @logger.warn("No suitable schema row found in file.", :path => path)
160
+ return []
161
+ end
162
+
163
+ schema = CSV.parse_line(csvFileLine, :col_sep => @separator, :quote_char => @quote_char)
164
+ addSchemaToCache(path, schema)
165
+ @logger.debug? && @logger.debug("Schema read from file:", :path => path, :cols => schema)
166
+
167
+ if @add_schema_cache_telemetry_to_event
168
+ event["_schemacachetelemetry"]="newEntryCreated"
169
+ event["_cache_touch_time"]=Time.now
170
+ end
171
+
172
+ # Special handling for the schema row event: tag _csvmetadata and don't return individual column attributes
173
+ if @fileColumns[path].join == parsedValues.join
174
+ @logger.debug? && @logger.debug("Received the schema row event. Tagging w/ _csvmetadata", :message => message)
175
+ event["_csvmetadata"] = true
176
+ return []
177
+ else
178
+ return schema
179
+ end
180
+
181
+ end
182
+
183
+ def getCachedSchemaForFile(path)
184
+ @fileColumns[path]
185
+ end
186
+
187
+ def addSchemaToCache(path, schema)
188
+ @fileColumns[path] = schema
189
+ touchSchema(path)
190
+ end
191
+
192
+ def touchSchema(path)
193
+ @schemaTouchedTimes[path] = Time.now
194
+ end
195
+
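+ # (Descriptive comment added for clarity; not part of the original source.)
+ # Returns the file's first line or, when schema_pattern_to_match is set, the first
+ # complete line (one ending in "\n") that matches the pattern. Returns nil/"" when
+ # the file is empty or no line qualifies; the caller treats that as "no schema found".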
196
+ def readSchemaLineFromFile(path)
197
+ csvFileLine = ""
198
+ File.open(path, "r") do |f|
199
+ while csvFileLine.length == 0 and csvFileLine = f.gets
200
+ if @schema_pattern_to_match
201
+ if !csvFileLine.end_with?("\n") or !csvFileLine.match(@schema_pattern_to_match)
202
+ csvFileLine = ""
203
+ end
204
+ end
205
+ end
206
+ end
207
+ csvFileLine
208
+ end
209
+
210
+ def scrubSchemaCache(event)
211
+ @logger.debug? && @logger.debug("Scrubbing schema cache", :size => @fileColumns.length)
212
+ event["_schemacachetelemetryscrubbedbeforecount"]=@fileColumns.length if @add_schema_cache_telemetry_to_event
213
+
214
+ expiringFiles = []
215
+ now = Time.now
216
+ @schemaTouchedTimes.each do |filename, lastReadTime|
217
+ if (lastReadTime + (@max_cached_schema_age_hours * 60 * 60)) < now
218
+ expiringFiles << filename
219
+ @logger.debug? && @logger.debug("Expiring schema for: ", :file => filename, :lastRead => lastReadTime)
220
+ end
221
+ end
222
+
223
+ expiringFiles.each do |filename|
224
+ @fileColumns.delete(filename)
225
+ @schemaTouchedTimes.delete(filename)
226
+ @logger.debug? && @logger.debug("Deleted schema for: ", :file => filename)
227
+ end
228
+
229
+ event["_schemacachetelemetryscrubbedaftercount"]=@fileColumns.length if @add_schema_cache_telemetry_to_event
230
+ @logger.debug? && @logger.debug("Done scrubbing schema cache", :size => @fileColumns.length)
231
+
232
+ end
233
+
234
+ end # class LogStash::Inputs::CSVFile
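
For orientation only (not shipped with the gem): a minimal Logstash pipeline sketch showing how the options documented above could be combined. The path value is a hypothetical placeholder; start_position is inherited from logstash-input-file, and the remaining options are the ones declared in the source above.

input {
  csvfile {
    path => "/var/data/reports/*.csv"        # hypothetical location of the CSV files
    start_position => "beginning"            # inherited from logstash-input-file
    first_line_defines_columns => true       # take each file's schema from its first (matching) line
    schema_pattern_to_match => "^.+$"        # ignore leading blank lines when locating the schema row
    max_cached_schema_age_hours => 24        # scrub cached schemas untouched for a day
    separator => ","
    quote_char => '"'
  }
}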
spec/inputs/csvfile_spec.rb ADDED
@@ -0,0 +1,521 @@
1
+ # encoding: utf-8
2
+
3
+ require "logstash/devutils/rspec/spec_helper"
4
+ require "tempfile"
5
+ require "stud/temporary"
6
+ require "logstash/inputs/csvfile"
7
+
8
+ describe "inputs/csvfile" do
9
+
10
+ delimiter = (LogStash::Environment.windows? ? "\r\n" : "\n")
11
+
12
+ #Borrowed this first check from file_spec.rb verbatim to get the pipeline running...
13
+ it "should starts at the end of an existing file" do
14
+ tmpfile_path = Stud::Temporary.pathname
15
+ sincedb_path = Stud::Temporary.pathname
16
+
17
+ conf = <<-CONFIG
18
+ input {
19
+ file {
20
+ type => "blah"
21
+ path => "#{tmpfile_path}"
22
+ sincedb_path => "#{sincedb_path}"
23
+ delimiter => "#{delimiter}"
24
+ }
25
+ }
26
+ CONFIG
27
+
28
+ File.open(tmpfile_path, "w") do |fd|
29
+ fd.puts("ignore me 1")
30
+ fd.puts("ignore me 2")
31
+ end
32
+
33
+ events = input(conf) do |pipeline, queue|
34
+
35
+ # at this point the plugins
36
+ # threads might still be initializing so we cannot know when the
37
+ # file plugin will have seen the original file, it could see it
38
+ # after the first(s) hello world appends below, hence the
39
+ # retry logic.
40
+
41
+ events = []
42
+
43
+ retries = 0
44
+ while retries < 20
45
+ File.open(tmpfile_path, "a") do |fd|
46
+ fd.puts("hello")
47
+ fd.puts("world")
48
+ end
49
+
50
+ if queue.size >= 2
51
+ events = 2.times.collect { queue.pop }
52
+ break
53
+ end
54
+
55
+ sleep(0.1)
56
+ retries += 1
57
+ end
58
+
59
+ events
60
+ end #input block
61
+
62
+ insist { events[0]["message"] } == "hello"
63
+ insist { events[1]["message"] } == "world"
64
+ end #it
65
+
66
+ it "should parse csv columns into event attributes using default column names" do
67
+ tmpfile_path = Stud::Temporary.pathname
68
+ sincedb_path = Stud::Temporary.pathname
69
+
70
+ conf = <<-CONFIG
71
+ input {
72
+ csvfile {
73
+ path => "#{tmpfile_path}"
74
+ start_position => "beginning"
75
+ sincedb_path => "#{sincedb_path}"
76
+ delimiter => "#{delimiter}"
77
+ }
78
+ }
79
+ CONFIG
80
+
81
+ File.open(tmpfile_path, "a") do |fd|
82
+ fd.puts("first,second,third")
83
+ fd.puts('"fou,rth","fifth"') #Quoting check
84
+ fd.puts("sixth,seventh,eighth,ninth")
85
+ end
86
+
87
+ events = input(conf) do |pipeline, queue|
88
+ 3.times.collect { queue.pop }
89
+ end
90
+
91
+ insist { events[0]["column1"] } == "first"
92
+ insist { events[0]["column2"] } == "second"
93
+ insist { events[0]["column3"] } == "third"
94
+ insist { events[1]["column1"] } == "fou,rth" #Not a typo: quoting check
95
+ insist { events[1]["column2"] } == "fifth"
96
+ insist { events[2]["column1"] } == "sixth"
97
+ insist { events[2]["column2"] } == "seventh"
98
+ insist { events[2]["column3"] } == "eighth"
99
+ insist { events[2]["column4"] } == "ninth"
100
+
101
+ end #it
102
+
103
+ it "should parse csv columns into attributes using explicitly defined column names, default-naming any excess columns; non-default csv separator" do
104
+ tmpfile_path = Stud::Temporary.pathname
105
+ sincedb_path = Stud::Temporary.pathname
106
+
107
+ conf = <<-CONFIG
108
+ input {
109
+ csvfile {
110
+ path => "#{tmpfile_path}"
111
+ start_position => "beginning"
112
+ sincedb_path => "#{sincedb_path}"
113
+ delimiter => "#{delimiter}"
114
+ separator => ";"
115
+ columns => ["FIRST_COL","SECOND_COL","THIRD_COL"]
116
+ }
117
+ }
118
+ CONFIG
119
+
120
+ File.open(tmpfile_path, "a") do |fd|
121
+ fd.puts("first;second;third")
122
+ fd.puts("fourth;fifth")
123
+ fd.puts("sixth;sev,enth;eighth;ninth")
124
+ end
125
+
126
+ events = input(conf) do |pipeline, queue|
127
+ 3.times.collect { queue.pop }
128
+ end
129
+
130
+ insist { events[0]["FIRST_COL"] } == "first"
131
+ insist { events[0]["SECOND_COL"] } == "second"
132
+ insist { events[0]["THIRD_COL"] } == "third"
133
+ insist { events[1]["FIRST_COL"] } == "fourth"
134
+ insist { events[1]["SECOND_COL"] } == "fifth"
135
+ insist { events[2]["FIRST_COL"] } == "sixth"
136
+ insist { events[2]["SECOND_COL"] } == "sev,enth"
137
+ insist { events[2]["THIRD_COL"] } == "eighth"
138
+ insist { events[2]["column4"] } == "ninth"
139
+
140
+ end #it
141
+
142
+ it "should parse csv columns into attributes using column names defined on the csv files 0th row with each csv file defining its own independent schema; it should tag schema row events as _csvmetadata" do
143
+ tmpfile_path = Stud::Temporary.pathname
144
+ tmpfile2_path = Stud::Temporary.pathname
145
+ sincedb_path = Stud::Temporary.pathname
146
+
147
+ conf = <<-CONFIG
148
+ input {
149
+ csvfile {
150
+ path => "#{tmpfile_path}"
151
+ path => "#{tmpfile2_path}"
152
+ start_position => "beginning"
153
+ sincedb_path => "#{sincedb_path}"
154
+ delimiter => "#{delimiter}"
155
+ first_line_defines_columns => true
156
+ }
157
+ }
158
+ CONFIG
159
+
160
+ File.open(tmpfile_path, "a") do |fd|
161
+ fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
162
+ fd.puts("first,second,third")
163
+ fd.puts("fourth,fifth")
164
+ fd.puts("sixth,seventh,eighth,ninth")
165
+ end
166
+
167
+ events = input(conf) do |pipeline, queue|
168
+ 4.times.collect { queue.pop }
169
+ end
170
+
171
+ insist { events[0]["_csvmetadata"] } == true
172
+ insist { events[1]["A_COLUMN"] } == "first"
173
+ insist { events[1]["B_COLUMN"] } == "second"
174
+ insist { events[1]["C_COLUMN"] } == "third"
175
+ insist { events[2]["A_COLUMN"] } == "fourth"
176
+ insist { events[2]["B_COLUMN"] } == "fifth"
177
+ insist { events[3]["A_COLUMN"] } == "sixth"
178
+ insist { events[3]["B_COLUMN"] } == "seventh"
179
+ insist { events[3]["C_COLUMN"] } == "eighth"
180
+ insist { events[3]["column4"] } == "ninth"
181
+
182
+ File.open(tmpfile2_path, "a") do |fd|
183
+ fd.puts("D_COLUMN,E_COLUMN,F_COLUMN")
184
+ fd.puts("first,second,third")
185
+ fd.puts("fourth,fifth")
186
+ fd.puts("sixth,seventh,eighth,ninth")
187
+ end
188
+
189
+ events = input(conf) do |pipeline, queue|
190
+ 4.times.collect { queue.pop }
191
+ end
192
+
193
+ insist { events[0]["_csvmetadata"] } == true
194
+ insist { events[1]["D_COLUMN"] } == "first"
195
+ insist { events[1]["E_COLUMN"] } == "second"
196
+ insist { events[1]["F_COLUMN"] } == "third"
197
+ insist { events[2]["D_COLUMN"] } == "fourth"
198
+ insist { events[2]["E_COLUMN"] } == "fifth"
199
+ insist { events[3]["D_COLUMN"] } == "sixth"
200
+ insist { events[3]["E_COLUMN"] } == "seventh"
201
+ insist { events[3]["F_COLUMN"] } == "eighth"
202
+ insist { events[3]["column4"] } == "ninth"
203
+
204
+ end #it
205
+
206
+ it "should parse csv columns into attributes using column names defined on the first file row that matches the schema_pattern_to_match with each csv file defining its own independent schema; it should tag schema row events as _csvmetadata" do
207
+ tmpfile_path = Stud::Temporary.pathname
208
+ sincedb_path = Stud::Temporary.pathname
209
+
210
+ conf = <<-CONFIG
211
+ input {
212
+ csvfile {
213
+ path => "#{tmpfile_path}"
214
+ start_position => "beginning"
215
+ sincedb_path => "#{sincedb_path}"
216
+ delimiter => "#{delimiter}"
217
+ first_line_defines_columns => true
218
+ schema_pattern_to_match => "^.+$"
219
+ }
220
+ }
221
+ CONFIG
222
+
223
+ File.open(tmpfile_path, "a") do |fd|
224
+ fd.puts("")
225
+ fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
226
+ fd.puts("first,second,third")
227
+ fd.puts("fourth,fifth")
228
+ fd.puts("sixth,seventh,eighth,ninth")
229
+ end
230
+
231
+ events = input(conf) do |pipeline, queue|
232
+ 5.times.collect { queue.pop }
233
+ end
234
+
235
+ insist { events[1]["_csvmetadata"] } == true
236
+ insist { events[2]["A_COLUMN"] } == "first"
237
+ insist { events[2]["B_COLUMN"] } == "second"
238
+ insist { events[2]["C_COLUMN"] } == "third"
239
+ insist { events[3]["A_COLUMN"] } == "fourth"
240
+ insist { events[3]["B_COLUMN"] } == "fifth"
241
+ insist { events[4]["A_COLUMN"] } == "sixth"
242
+ insist { events[4]["B_COLUMN"] } == "seventh"
243
+ insist { events[4]["C_COLUMN"] } == "eighth"
244
+ insist { events[4]["column4"] } == "ninth"
245
+
246
+ end #it
247
+
248
+ it "should parse csv columns into attributes using explicitly defined column names, default-naming any excess columns; non-default csv separator" do
249
+ tmpfile_path = Stud::Temporary.pathname
250
+ sincedb_path = Stud::Temporary.pathname
251
+
252
+ conf = <<-CONFIG
253
+ input {
254
+ csvfile {
255
+ path => "#{tmpfile_path}"
256
+ start_position => "beginning"
257
+ sincedb_path => "#{sincedb_path}"
258
+ delimiter => "#{delimiter}"
259
+ separator => ";"
260
+ columns => ["FIRST_COL","SECOND_COL","THIRD_COL"]
261
+ }
262
+ }
263
+ CONFIG
264
+
265
+ File.open(tmpfile_path, "a") do |fd|
266
+ fd.puts("first;second;third")
267
+ fd.puts("fourth;fifth")
268
+ fd.puts("sixth;sev,enth;eighth;ninth")
269
+ end
270
+
271
+ events = input(conf) do |pipeline, queue|
272
+ 3.times.collect { queue.pop }
273
+ end
274
+
275
+ insist { events[0]["FIRST_COL"] } == "first"
276
+ insist { events[0]["SECOND_COL"] } == "second"
277
+ insist { events[0]["THIRD_COL"] } == "third"
278
+ insist { events[1]["FIRST_COL"] } == "fourth"
279
+ insist { events[1]["SECOND_COL"] } == "fifth"
280
+ insist { events[2]["FIRST_COL"] } == "sixth"
281
+ insist { events[2]["SECOND_COL"] } == "sev,enth"
282
+ insist { events[2]["THIRD_COL"] } == "eighth"
283
+ insist { events[2]["column4"] } == "ninth"
284
+
285
+ end #it
286
+
287
+ it "should cache schemas per file" do
288
+ tmpfile_path = Stud::Temporary.pathname
289
+ tmpfile2_path = Stud::Temporary.pathname
290
+ sincedb_path = Stud::Temporary.pathname
291
+
292
+ conf = <<-CONFIG
293
+ input {
294
+ csvfile {
295
+ path => "#{tmpfile_path}"
296
+ path => "#{tmpfile2_path}"
297
+ start_position => "beginning"
298
+ sincedb_path => "#{sincedb_path}"
299
+ delimiter => "#{delimiter}"
300
+ first_line_defines_columns => true
301
+ add_schema_cache_telemetry_to_event => true
302
+ }
303
+ }
304
+ CONFIG
305
+
306
+
307
+ events = input(conf) do |pipeline, queue|
308
+ File.open(tmpfile_path, "a") do |fd|
309
+ fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
310
+ fd.puts("first,second,third")
311
+ end
312
+
313
+ sleep 1
314
+
315
+ File.open(tmpfile2_path, "a") do |fd|
316
+ fd.puts("D_COLUMN,E_COLUMN,F_COLUMN")
317
+ fd.puts("1st,2nd,3rd")
318
+ end
319
+
320
+ 4.times.collect { queue.pop }
321
+ end
322
+
323
+ insist { events[0]["_csvmetadata"] } == true
324
+ insist { events[0]["_schemacachetelemetry"] } == "newEntryCreated"
325
+
326
+ insist { events[1]["A_COLUMN"] } == "first"
327
+ insist { events[1]["B_COLUMN"] } == "second"
328
+ insist { events[1]["C_COLUMN"] } == "third"
329
+ insist { events[1]["_schemacachetelemetry"] } == "cachedEntryUsed"
330
+
331
+ insist { events[2]["_csvmetadata"] } == true
332
+ insist { events[2]["_schemacachetelemetry"] } == "newEntryCreated"
333
+
334
+ insist { events[3]["D_COLUMN"] } == "1st"
335
+ insist { events[3]["E_COLUMN"] } == "2nd"
336
+ insist { events[3]["F_COLUMN"] } == "3rd"
337
+ insist { events[3]["_schemacachetelemetry"] } == "cachedEntryUsed"
338
+
339
+ end #it
340
+
341
+ it "should resume processing of a csv file after logstash restarts" do
342
+ tmpfile_path = Stud::Temporary.pathname
343
+ sincedb_path = Stud::Temporary.pathname
344
+
345
+ # Request that telemetry be added to the event to make cache usage visible.
346
+ conf = <<-CONFIG
347
+ input {
348
+ csvfile {
349
+ path => "#{tmpfile_path}"
350
+ start_position => "beginning"
351
+ sincedb_path => "#{sincedb_path}"
352
+ delimiter => "#{delimiter}"
353
+ first_line_defines_columns => true
354
+ add_schema_cache_telemetry_to_event => true
355
+ }
356
+ }
357
+ CONFIG
358
+
359
+
360
+ File.open(tmpfile_path, "a") do |fd|
361
+ fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
362
+ fd.puts("first,second,third")
363
+ end
364
+
365
+ events = input(conf) do |pipeline, queue|
366
+ 2.times.collect { queue.pop }
367
+ end
368
+
369
+ insist { events[0]["_csvmetadata"] } == true
370
+ insist { events[0]["_schemacachetelemetry"] } == "newEntryCreated"
371
+
372
+ insist { events[1]["A_COLUMN"] } == "first"
373
+ insist { events[1]["B_COLUMN"] } == "second"
374
+ insist { events[1]["C_COLUMN"] } == "third"
375
+ insist { events[1]["_schemacachetelemetry"] } == "cachedEntryUsed"
376
+
377
+ File.open(tmpfile_path, "a") do |fd|
378
+ fd.puts("fourth,fifth,sixth")
379
+ end
380
+
381
+ events = input(conf) do |pipeline, queue|
382
+ 1.times.collect { queue.pop }
383
+ end
384
+
385
+ insist { events[0]["A_COLUMN"] } == "fourth"
386
+ insist { events[0]["B_COLUMN"] } == "fifth"
387
+ insist { events[0]["C_COLUMN"] } == "sixth"
388
+ insist { events[0]["_schemacachetelemetry"] } == "newEntryCreated"
389
+
390
+ end #it
391
+
392
+ it "should expire schema cache entries if untouched for more than their configured lifetime (10s in this case)" do
393
+
394
+ # This was tricky to write. Key points:
395
+ # - Utilizes a special white-box mode of the plugin that exposes what its doing with its schema cache in telemetry attributes.
396
+ # - While cache durations are typically in multiple hours, for testing we dial it back to 10s via a small fractional number.
397
+ # - All the various file IO has to go into the input block
398
+ # - The queue reads are sprinkled throughout to synchronize the test proc with logstash's file processing.
399
+ # - Put the insists right after the queue reads to better tie the inputs with the expected outputs.
400
+
401
+ puts "\nThe caching test now running will take a while... (~30s)"
402
+
403
+ tmpfile_path = Stud::Temporary.pathname
404
+ tmpfile2_path = Stud::Temporary.pathname
405
+ tmpfile3_path = Stud::Temporary.pathname
406
+ sincedb_path = Stud::Temporary.pathname
407
+
408
+ conf = <<-CONFIG
409
+ input {
410
+ csvfile {
411
+ path => "#{tmpfile_path}"
412
+ path => "#{tmpfile2_path}"
413
+ path => "#{tmpfile3_path}"
414
+ start_position => "beginning"
415
+ sincedb_path => "#{sincedb_path}"
416
+ delimiter => "#{delimiter}"
417
+ first_line_defines_columns => true
418
+ max_cached_schema_age_hours => 0.0027777777777778
419
+ add_schema_cache_telemetry_to_event => true
420
+ discover_interval => 1
421
+ }
422
+ }
423
+ CONFIG
424
+
425
+ events = input(conf) do |pipeline, queue|
426
+
427
+ # File1 Initial Entries. File 1's schema will be cached.
428
+ File.open(tmpfile_path, "a") do |fd|
429
+ fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
430
+ fd.puts("first,second,third")
431
+ end
432
+ # Verify File1 schema was cached and schema row was tagged as csvmetadata
433
+ event = queue.pop
434
+ insist { event["_schemacachetelemetry"] } == "newEntryCreated"
435
+ insist { event["_csvmetadata"] } == true
436
+
437
+ # Verify that cached File1 schema was used to decode row2 of File1
438
+ event = queue.pop
439
+ insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
440
+ insist { event["A_COLUMN"] } == "first"
441
+ insist { event["B_COLUMN"] } == "second"
442
+ insist { event["C_COLUMN"] } == "third"
443
+
444
+ # File2 Initial Entries
445
+ File.open(tmpfile2_path, "a") do |fd|
446
+ fd.puts("D_COLUMN,E_COLUMN,F_COLUMN")
447
+ fd.puts("1st,2nd,3rd")
448
+ end
449
+ # Verify File2 schema was cached and schema row was tagged as csvmetadata
450
+ event = queue.pop
451
+ insist { event["_schemacachetelemetry"] } == "newEntryCreated"
452
+ insist { event["_csvmetadata"] } == true
453
+
454
+ # Verify that cached File2 schema was used to decode row2 of File2
455
+ event = queue.pop
456
+ insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
457
+ insist { event["D_COLUMN"] } == "1st"
458
+ insist { event["E_COLUMN"] } == "2nd"
459
+ insist { event["F_COLUMN"] } == "3rd"
460
+
461
+ # Touch File1 before its cached schema entries expires (<10s), refreshing the entry.
462
+ sleep 5
463
+ File.open(tmpfile_path, "a") do |fd|
464
+ fd.puts("fourth,fifth,sixth")
465
+ end
466
+ # Verify that still-cached File1 schema was used to decode newly added row of File1
467
+ event = queue.pop
468
+ insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
469
+ insist { event["A_COLUMN"] } == "fourth"
470
+ insist { event["B_COLUMN"] } == "fifth"
471
+ insist { event["C_COLUMN"] } == "sixth"
472
+
473
+ # Touch File1 again after File2's cache entry expires.
474
+ sleep 10
475
+ File.open(tmpfile_path, "a") do |fd|
476
+ fd.puts("seventh,eighth,ninth")
477
+ end
478
+ # Verify that File1's entry hasn't expired, by virtue of the previous touch refreshing it.
479
+ event = queue.pop
480
+ insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
481
+ insist { event["A_COLUMN"] } == "seventh"
482
+ insist { event["B_COLUMN"] } == "eighth"
483
+ insist { event["C_COLUMN"] } == "ninth"
484
+
485
+ # Touch File3. Creation of its cache entry forces purge of File2's expired entry, which is made visible via telemetry.
486
+ sleep 1
487
+ File.open(tmpfile3_path, "a") do |fd|
488
+ fd.puts("X_COLUMN,Y_COLUMN,Z_COLUMN")
489
+ fd.puts("erste,zweite,dritte")
490
+ end
491
+ # Verify that scrubbing of expired cache entries takes place, reducing cached count from 2 (File1 & File2) to 1 (Just File1).
492
+ # (Scrubbing takes place before creation of File3's schema entry in the cache.)
493
+ event = queue.pop
494
+ insist { event["_csvmetadata"] } == true
495
+ insist { event["_schemacachetelemetry"] } == "newEntryCreated"
496
+ insist { event["_schemacachetelemetryscrubbedbeforecount"] } == 2
497
+ insist { event["_schemacachetelemetryscrubbedaftercount"] } == 1
498
+
499
+ # Verify that File3's schema did in fact get cached.
500
+ event = queue.pop
501
+ insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
502
+ insist { event["X_COLUMN"] } == "erste"
503
+ insist { event["Y_COLUMN"] } == "zweite"
504
+ insist { event["Z_COLUMN"] } == "dritte"
505
+
506
+ # File2 post-expiration entry. Should re-create the File2 cache entry.
507
+ sleep 1
508
+ File.open(tmpfile2_path, "a") do |fd|
509
+ fd.puts("4th,5th,6th")
510
+ end
511
+ # Verify that File2's schema gets recreated (but not transmitted as an event since this isn't the natural row0 read).
512
+ event = queue.pop
513
+ insist { event["_schemacachetelemetry"] } == "newEntryCreated"
514
+ insist { event["D_COLUMN"] } == "4th"
515
+ insist { event["E_COLUMN"] } == "5th"
516
+ insist { event["F_COLUMN"] } == "6th"
517
+
518
+ end #input block
519
+ end #it
520
+
521
+ end
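
A small self-contained Ruby sketch (illustrative only, not part of the gem) of the per-column naming rule exercised by the specs above: a parsed value takes its name from the schema entry at its index when one exists, otherwise it falls back to the auto-numbered "columnN" form, exactly as decorate() does with cols[i] || "column#{i+1}".

# encoding: utf-8
require "csv"

schema = ["FIRST_COL", "SECOND_COL"]   # e.g. from `columns` or a cached first-line schema
row    = "sixth;sev,enth;eighth;ninth" # sample line borrowed from the spec fixtures

values = CSV.parse_line(row, :col_sep => ";", :quote_char => '"')

fields = {}
values.each_index do |i|
  field_name = schema[i] || "column#{i + 1}"   # same fallback the plugin applies per column
  fields[field_name] = values[i]
end

# fields => {"FIRST_COL"=>"sixth", "SECOND_COL"=>"sev,enth", "column3"=>"eighth", "column4"=>"ninth"}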
metadata ADDED
@@ -0,0 +1,124 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: logstash-input-csvfile
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.6
5
+ platform: ruby
6
+ authors:
7
+ - jweite
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-02-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - '>='
17
+ - !ruby/object:Gem::Version
18
+ version: 1.5.0
19
+ - - <
20
+ - !ruby/object:Gem::Version
21
+ version: 3.0.0
22
+ name: logstash-core
23
+ prerelease: false
24
+ type: :runtime
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - '>='
28
+ - !ruby/object:Gem::Version
29
+ version: 1.5.0
30
+ - - <
31
+ - !ruby/object:Gem::Version
32
+ version: 3.0.0
33
+ - !ruby/object:Gem::Dependency
34
+ requirement: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ name: logstash-codec-plain
40
+ prerelease: false
41
+ type: :runtime
42
+ version_requirements: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - '>='
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ - !ruby/object:Gem::Dependency
48
+ requirement: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - '>='
51
+ - !ruby/object:Gem::Version
52
+ version: 1.0.1
53
+ name: logstash-input-file
54
+ prerelease: false
55
+ type: :runtime
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - '>='
59
+ - !ruby/object:Gem::Version
60
+ version: 1.0.1
61
+ - !ruby/object:Gem::Dependency
62
+ requirement: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ name: stud
68
+ prerelease: false
69
+ type: :runtime
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - '>='
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ - !ruby/object:Gem::Dependency
76
+ requirement: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ name: logstash-devutils
82
+ prerelease: false
83
+ type: :development
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - '>='
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ description: This gem is a logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/plugin install gemname. This gem is not a stand-alone program
90
+ email: jweite@yahoo.com
91
+ executables: []
92
+ extensions: []
93
+ extra_rdoc_files: []
94
+ files:
95
+ - lib/logstash/inputs/csvfile.rb
96
+ - spec/inputs/csvfile_spec.rb
97
+ homepage: ''
98
+ licenses:
99
+ - Apache License (2.0)
100
+ metadata:
101
+ logstash_plugin: 'true'
102
+ logstash_group: input
103
+ post_install_message:
104
+ rdoc_options: []
105
+ require_paths:
106
+ - lib
107
+ required_ruby_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - '>='
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ required_rubygems_version: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - '>='
115
+ - !ruby/object:Gem::Version
116
+ version: '0'
117
+ requirements: []
118
+ rubyforge_project:
119
+ rubygems_version: 2.4.5
120
+ signing_key:
121
+ specification_version: 4
122
+ summary: Extends logstash-input-file to parse csv files, optionally respecting 'first-line schemas'
123
+ test_files:
124
+ - spec/inputs/csvfile_spec.rb