logstash-input-csvfile 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/logstash/inputs/csvfile.rb +234 -0
- data/spec/inputs/csvfile_spec.rb +521 -0
- metadata +124 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: a3f4dde1199a122dfa627e710320adb46356b423
  data.tar.gz: 91dfb639e4dbcc82d53297d6ef4b18b359042002
SHA512:
  metadata.gz: 671ea579d1cc6ec358c7695f22a59f06f1a67f269389e384c9fc527aa662c4d8c0c5f373c6837f264da040cf177824c1e86e4e5f13fd5126cef0dba75370dd74
  data.tar.gz: 86321481379fc771b41b7919576624446d41a6add7264e7862357fb71404cdea6c8d3b0327aeca916c6f3932a3866832ee5a136b992296de3b8862ca4c22fb11
data/lib/logstash/inputs/csvfile.rb
ADDED
@@ -0,0 +1,234 @@
# encoding: utf-8
require "logstash/inputs/file"
require "logstash/namespace"
require "csv"

# Subclass of logstash-input-file that parses CSV lines, with support for first-line schemas.
# Set first_line_defines_columns => true to enable this behavior.
# Statically defined columns are also supported, a la logstash-filter-csv, via the columns param.
# first_line_defines_columns => true takes precedence, though.
#
# Since multiple files may be being read by the same plugin instance, and each can have
# a distinct schema, this plugin records a schema per source file (as defined by the
# event's path attribute) in a hash. When it receives an event for a file it doesn't
# know, it reads/parses that file's first line to obtain the schema. This method supports
# resuming processing after logstash restarts in mid-file.
#
# I considered extending logstash-filter-csv to do this, but felt that the only reliable
# way to support streaming csv reads was to explicitly read the schema from the file's schema row
# (and cache it so subsequent row performance for that file is good). Since we cannot count
# on a logstash filter having read-access to the file, or even processing events that originate
# from files, I rejected this approach. By definition, a file input plugin must have read-access
# to the file it's sourcing data from.
#
# This plugin borrows most of its csv parsing logic from logstash-filter-csv.
#
# This plugin extends logstash-input-file by overriding its decorate method. Note that
# logstash-input-plugin 0.0.10, released with Logstash 1.5, doesn't set the event's
# path element before calling decorate (which this plugin requires), so the gemspec insists
# on logstash-input-file 1.1.0.
#

class LogStash::Inputs::CSVFile < LogStash::Inputs::File
  config_name "csvfile"

  # Define a list of column names (in the order they appear in the CSV,
  # as if it were a header line). If `columns` is not configured, or there
  # are not enough columns specified, the default column names are
  # "column1", "column2", etc. In the case that there are more columns
  # in the data than specified in this column list, extra columns will be auto-numbered:
  # (e.g. "user_defined_1", "user_defined_2", "column3", "column4", etc.)
  config :columns, :validate => :array, :default => []

  # Bool flag enables sourcing of column names from the first event (line) of each file.
  # A dynamic alternative to explicitly defining columns in the columns attribute.
  config :first_line_defines_columns, :validate => :boolean, :default => false

  # Define the column separator value. If this is not specified, the default
  # is a comma `,`.
  # Optional.
  config :separator, :validate => :string, :default => ","

  # Define the character used to quote CSV fields. If this is not specified
  # the default is a double quote `"`.
  # Optional.
  config :quote_char, :validate => :string, :default => '"'

  # Define target field for placing the data.
  # Defaults to writing to the root of the event.
  config :target, :validate => :string

  # The maximum time a csv file's schema can be unused (in hours) before
  # it is automatically scrubbed to avoid memory leakage.
  # If an event for that file arrives subsequently the schema will be
  # reconstituted (albeit with the penalty of a schema row re-read from the file).
  #
  # Cache scrubbing occurs inline only when new files are detected, to minimize
  # perf impact on most CSV events. Since new file detection time is the only time
  # the cache actually grows, and we're expecting to pay the schema-read penalty then
  # anyway, it's an optimal time to scrub.
  #
  # 0 disables scrubbing, but memory will grow. OK if you're routinely restarting logstash.
  config :max_cached_schema_age_hours, :validate => :number, :default => 24

  # To handle cases where there's other content in the file before the schema row
  # that you'll want to ignore. For instance, you can skip leading blank lines
  # before the schema by matching non-blank lines with "^.+$".
  # Note that the plugin will still emit events for pre-schema rows, albeit with
  # no attributes (for blank lines) or default-named attributes (if the pre-schema
  # lines do parse as valid CSV).
  config :schema_pattern_to_match, :validate => :string

  # To support testing. Adds attributes to events regarding schema cache behavior.
  config :add_schema_cache_telemetry_to_event, :validate => :boolean, :default => false

  public
  def register
    @fileColumns = Hash.new
    @schemaTouchedTimes = Hash.new
    super()

    @logger.warn("schema cache scrubbing disabled. Memory use will grow over time.") if @max_cached_schema_age_hours <= 0
  end

  def decorate(event)
    super(event)

    message = event["message"]
    return if !message

    begin
      values = CSV.parse_line(message, :col_sep => @separator, :quote_char => @quote_char)
      return if values.length == 0

      # Get names for the columns.
      if @first_line_defines_columns
        @logger.debug? && @logger.debug("handling csv in first_line_defines_columns mode", :message => message, :columns => @columns)
        cols = getSchemaForFile(event, values)
      else
        @logger.debug? && @logger.debug("handling csv in explicitly defined columns mode", :message => message, :columns => @columns)
        cols = @columns
      end

      # Determine where to write the new attributes
      if @target.nil?
        # Default is to write to the root of the event.
        dest = event
      else
        dest = event[@target] ||= {}
      end

      # Add the per-column attributes (as long as this isn't the event from the schema defining row)
      if !event["_csvmetadata"]
        values.each_index do |i|
          field_name = cols[i] || "column#{i+1}"
          dest[field_name] = values[i]
        end
      end

    rescue => e
      event.tag "_csvparsefailure"
      @logger.warn("Trouble parsing csv", :message => message, :exception => e)
      return
    end # begin
  end # decorate()

  def getSchemaForFile(event, parsedValues)
    path = event["path"]
    if !path
      @logger.warn("No path in event. Cannot retrieve a schema for this event.")
      return []
    end

    @logger.debug? && @logger.debug("Getting schema for file", :path => path)

    schema = getCachedSchemaForFile(path)
    if schema
      @logger.debug? && @logger.debug("Using cached schema", :cols => schema)
      event["_schemacachetelemetry"] = "cachedEntryUsed" if @add_schema_cache_telemetry_to_event
      touchSchema(path)
      return schema
    end

    @logger.debug? && @logger.debug("Event from unknown file/schema. Reading schema from that file.", :path => path)

    scrubSchemaCache(event) if @max_cached_schema_age_hours > 0

    csvFileLine = readSchemaLineFromFile(path)
    if !csvFileLine || csvFileLine.length == 0
      @logger.warn("No suitable schema row found in file.", :path => path)
      return []
    end

    schema = CSV.parse_line(csvFileLine, :col_sep => @separator, :quote_char => @quote_char)
    addSchemaToCache(path, schema)
    @logger.debug? && @logger.debug("Schema read from file:", :path => path, :cols => schema)

    if @add_schema_cache_telemetry_to_event
      event["_schemacachetelemetry"] = "newEntryCreated"
      event["_cache_touch_time"] = Time.now
    end

    # Special handling for the schema row event: tag _csvmetadata and don't return individual column attributes
    if @fileColumns[path].join == parsedValues.join
      @logger.debug? && @logger.debug("Received the schema row event. Tagging w/ _csvmetadata", :path => path)
      event["_csvmetadata"] = true
      return []
    else
      return schema
    end

  end

  def getCachedSchemaForFile(path)
    @fileColumns[path]
  end

  def addSchemaToCache(path, schema)
    @fileColumns[path] = schema
    touchSchema(path)
  end

  def touchSchema(path)
    @schemaTouchedTimes[path] = Time.now
  end

  def readSchemaLineFromFile(path)
    csvFileLine = ""
    File.open(path, "r") do |f|
      while csvFileLine.length == 0 and csvFileLine = f.gets
        if @schema_pattern_to_match
          if !csvFileLine.end_with?("\n") or !csvFileLine.match(@schema_pattern_to_match)
            csvFileLine = ""
          end
        end
      end
    end
    csvFileLine
  end

  def scrubSchemaCache(event)
    @logger.debug? && @logger.debug("Scrubbing schema cache", :size => @fileColumns.length)
    event["_schemacachetelemetryscrubbedbeforecount"] = @fileColumns.length if @add_schema_cache_telemetry_to_event

    expiringFiles = []
    now = Time.now
    @schemaTouchedTimes.each do |filename, lastReadTime|
      if (lastReadTime + (@max_cached_schema_age_hours * 60 * 60)) < now
        expiringFiles << filename
        @logger.debug? && @logger.debug("Expiring schema for: ", :file => filename, :lastRead => lastReadTime)
      end
    end

    expiringFiles.each do |filename|
      @fileColumns.delete(filename)
      @schemaTouchedTimes.delete(filename)
      @logger.debug? && @logger.debug("Deleted schema for: ", :file => filename)
    end

    event["_schemacachetelemetryscrubbedaftercount"] = @fileColumns.length if @add_schema_cache_telemetry_to_event
    @logger.debug? && @logger.debug("Done scrubbing schema cache", :size => @fileColumns.length)

  end

end # class LogStash::Inputs::CSVFile
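For orientation, a minimal pipeline configuration exercising this input might look like the sketch below. It is illustrative only: the path and sincedb_path values are placeholders, and the options shown are simply the ones declared by the plugin above plus the usual logstash-input-file options used in the specs that follow.

    input {
      csvfile {
        path => "/var/data/*.csv"                # placeholder; point at your own CSV files
        sincedb_path => "/var/data/.sincedb"     # placeholder
        start_position => "beginning"            # inherited from logstash-input-file
        first_line_defines_columns => true       # take column names from each file's header row
        # columns => ["colA", "colB", "colC"]    # alternative: statically defined column names
        separator => ","
        max_cached_schema_age_hours => 24        # scrub cached schemas untouched for a day
      }
    }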
data/spec/inputs/csvfile_spec.rb
ADDED
@@ -0,0 +1,521 @@
# encoding: utf-8

require "logstash/devutils/rspec/spec_helper"
require "tempfile"
require "stud/temporary"
require "logstash/inputs/csvfile"

describe "inputs/csvfile" do

  delimiter = (LogStash::Environment.windows? ? "\r\n" : "\n")

  # Borrowed this first check from file_spec.rb verbatim to get the pipeline running...
  it "should start at the end of an existing file" do
    tmpfile_path = Stud::Temporary.pathname
    sincedb_path = Stud::Temporary.pathname

    conf = <<-CONFIG
      input {
        file {
          type => "blah"
          path => "#{tmpfile_path}"
          sincedb_path => "#{sincedb_path}"
          delimiter => "#{delimiter}"
        }
      }
    CONFIG

    File.open(tmpfile_path, "w") do |fd|
      fd.puts("ignore me 1")
      fd.puts("ignore me 2")
    end

    events = input(conf) do |pipeline, queue|

      # at this point the plugin's
      # threads might still be initializing so we cannot know when the
      # file plugin will have seen the original file, it could see it
      # after the first(s) hello world appends below, hence the
      # retry logic.

      events = []

      retries = 0
      while retries < 20
        File.open(tmpfile_path, "a") do |fd|
          fd.puts("hello")
          fd.puts("world")
        end

        if queue.size >= 2
          events = 2.times.collect { queue.pop }
          break
        end

        sleep(0.1)
        retries += 1
      end

      events
    end # input block

    insist { events[0]["message"] } == "hello"
    insist { events[1]["message"] } == "world"
  end # it

  it "should parse csv columns into event attributes using default column names" do
    tmpfile_path = Stud::Temporary.pathname
    sincedb_path = Stud::Temporary.pathname

    conf = <<-CONFIG
      input {
        csvfile {
          path => "#{tmpfile_path}"
          start_position => "beginning"
          sincedb_path => "#{sincedb_path}"
          delimiter => "#{delimiter}"
        }
      }
    CONFIG

    File.open(tmpfile_path, "a") do |fd|
      fd.puts("first,second,third")
      fd.puts('"fou,rth","fifth"') # Quoting check
      fd.puts("sixth,seventh,eighth,ninth")
    end

    events = input(conf) do |pipeline, queue|
      3.times.collect { queue.pop }
    end

    insist { events[0]["column1"] } == "first"
    insist { events[0]["column2"] } == "second"
    insist { events[0]["column3"] } == "third"
    insist { events[1]["column1"] } == "fou,rth" # Not a typo: quoting check
    insist { events[1]["column2"] } == "fifth"
    insist { events[2]["column1"] } == "sixth"
    insist { events[2]["column2"] } == "seventh"
    insist { events[2]["column3"] } == "eighth"
    insist { events[2]["column4"] } == "ninth"

  end # it

  it "should parse csv columns into attributes using explicitly defined column names, default-naming any excess columns; non-default csv separator" do
    tmpfile_path = Stud::Temporary.pathname
    sincedb_path = Stud::Temporary.pathname

    conf = <<-CONFIG
      input {
        csvfile {
          path => "#{tmpfile_path}"
          start_position => "beginning"
          sincedb_path => "#{sincedb_path}"
          delimiter => "#{delimiter}"
          separator => ";"
          columns => ["FIRST_COL","SECOND_COL","THIRD_COL"]
        }
      }
    CONFIG

    File.open(tmpfile_path, "a") do |fd|
      fd.puts("first;second;third")
      fd.puts("fourth;fifth")
      fd.puts("sixth;sev,enth;eighth;ninth")
    end

    events = input(conf) do |pipeline, queue|
      3.times.collect { queue.pop }
    end

    insist { events[0]["FIRST_COL"] } == "first"
    insist { events[0]["SECOND_COL"] } == "second"
    insist { events[0]["THIRD_COL"] } == "third"
    insist { events[1]["FIRST_COL"] } == "fourth"
    insist { events[1]["SECOND_COL"] } == "fifth"
    insist { events[2]["FIRST_COL"] } == "sixth"
    insist { events[2]["SECOND_COL"] } == "sev,enth"
    insist { events[2]["THIRD_COL"] } == "eighth"
    insist { events[2]["column4"] } == "ninth"

  end # it

  it "should parse csv columns into attributes using column names defined on the csv file's 0th row, with each csv file defining its own independent schema; it should tag schema row events as _csvmetadata" do
    tmpfile_path = Stud::Temporary.pathname
    tmpfile2_path = Stud::Temporary.pathname
    sincedb_path = Stud::Temporary.pathname

    conf = <<-CONFIG
      input {
        csvfile {
          path => "#{tmpfile_path}"
          path => "#{tmpfile2_path}"
          start_position => "beginning"
          sincedb_path => "#{sincedb_path}"
          delimiter => "#{delimiter}"
          first_line_defines_columns => true
        }
      }
    CONFIG

    File.open(tmpfile_path, "a") do |fd|
      fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
      fd.puts("first,second,third")
      fd.puts("fourth,fifth")
      fd.puts("sixth,seventh,eighth,ninth")
    end

    events = input(conf) do |pipeline, queue|
      4.times.collect { queue.pop }
    end

    insist { events[0]["_csvmetadata"] } == true
    insist { events[1]["A_COLUMN"] } == "first"
    insist { events[1]["B_COLUMN"] } == "second"
    insist { events[1]["C_COLUMN"] } == "third"
    insist { events[2]["A_COLUMN"] } == "fourth"
    insist { events[2]["B_COLUMN"] } == "fifth"
    insist { events[3]["A_COLUMN"] } == "sixth"
    insist { events[3]["B_COLUMN"] } == "seventh"
    insist { events[3]["C_COLUMN"] } == "eighth"
    insist { events[3]["column4"] } == "ninth"

    File.open(tmpfile2_path, "a") do |fd|
      fd.puts("D_COLUMN,E_COLUMN,F_COLUMN")
      fd.puts("first,second,third")
      fd.puts("fourth,fifth")
      fd.puts("sixth,seventh,eighth,ninth")
    end

    events = input(conf) do |pipeline, queue|
      4.times.collect { queue.pop }
    end

    insist { events[0]["_csvmetadata"] } == true
    insist { events[1]["D_COLUMN"] } == "first"
    insist { events[1]["E_COLUMN"] } == "second"
    insist { events[1]["F_COLUMN"] } == "third"
    insist { events[2]["D_COLUMN"] } == "fourth"
    insist { events[2]["E_COLUMN"] } == "fifth"
    insist { events[3]["D_COLUMN"] } == "sixth"
    insist { events[3]["E_COLUMN"] } == "seventh"
    insist { events[3]["F_COLUMN"] } == "eighth"
    insist { events[3]["column4"] } == "ninth"

  end # it

  it "should parse csv columns into attributes using column names defined on the first file row that matches schema_pattern_to_match, with each csv file defining its own independent schema; it should tag schema row events as _csvmetadata" do
    tmpfile_path = Stud::Temporary.pathname
    sincedb_path = Stud::Temporary.pathname

    conf = <<-CONFIG
      input {
        csvfile {
          path => "#{tmpfile_path}"
          start_position => "beginning"
          sincedb_path => "#{sincedb_path}"
          delimiter => "#{delimiter}"
          first_line_defines_columns => true
          schema_pattern_to_match => "^.+$"
        }
      }
    CONFIG

    File.open(tmpfile_path, "a") do |fd|
      fd.puts("")
      fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
      fd.puts("first,second,third")
      fd.puts("fourth,fifth")
      fd.puts("sixth,seventh,eighth,ninth")
    end

    events = input(conf) do |pipeline, queue|
      5.times.collect { queue.pop }
    end

    insist { events[1]["_csvmetadata"] } == true
    insist { events[2]["A_COLUMN"] } == "first"
    insist { events[2]["B_COLUMN"] } == "second"
    insist { events[2]["C_COLUMN"] } == "third"
    insist { events[3]["A_COLUMN"] } == "fourth"
    insist { events[3]["B_COLUMN"] } == "fifth"
    insist { events[4]["A_COLUMN"] } == "sixth"
    insist { events[4]["B_COLUMN"] } == "seventh"
    insist { events[4]["C_COLUMN"] } == "eighth"
    insist { events[4]["column4"] } == "ninth"

  end # it

  it "should parse csv columns into attributes using explicitly defined column names, default-naming any excess columns; non-default csv separator" do
    tmpfile_path = Stud::Temporary.pathname
    sincedb_path = Stud::Temporary.pathname

    conf = <<-CONFIG
      input {
        csvfile {
          path => "#{tmpfile_path}"
          start_position => "beginning"
          sincedb_path => "#{sincedb_path}"
          delimiter => "#{delimiter}"
          separator => ";"
          columns => ["FIRST_COL","SECOND_COL","THIRD_COL"]
        }
      }
    CONFIG

    File.open(tmpfile_path, "a") do |fd|
      fd.puts("first;second;third")
      fd.puts("fourth;fifth")
      fd.puts("sixth;sev,enth;eighth;ninth")
    end

    events = input(conf) do |pipeline, queue|
      3.times.collect { queue.pop }
    end

    insist { events[0]["FIRST_COL"] } == "first"
    insist { events[0]["SECOND_COL"] } == "second"
    insist { events[0]["THIRD_COL"] } == "third"
    insist { events[1]["FIRST_COL"] } == "fourth"
    insist { events[1]["SECOND_COL"] } == "fifth"
    insist { events[2]["FIRST_COL"] } == "sixth"
    insist { events[2]["SECOND_COL"] } == "sev,enth"
    insist { events[2]["THIRD_COL"] } == "eighth"
    insist { events[2]["column4"] } == "ninth"

  end # it

  it "should cache schemas per file" do
    tmpfile_path = Stud::Temporary.pathname
    tmpfile2_path = Stud::Temporary.pathname
    sincedb_path = Stud::Temporary.pathname

    conf = <<-CONFIG
      input {
        csvfile {
          path => "#{tmpfile_path}"
          path => "#{tmpfile2_path}"
          start_position => "beginning"
          sincedb_path => "#{sincedb_path}"
          delimiter => "#{delimiter}"
          first_line_defines_columns => true
          add_schema_cache_telemetry_to_event => true
        }
      }
    CONFIG

    events = input(conf) do |pipeline, queue|
      File.open(tmpfile_path, "a") do |fd|
        fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
        fd.puts("first,second,third")
      end

      sleep 1

      File.open(tmpfile2_path, "a") do |fd|
        fd.puts("D_COLUMN,E_COLUMN,F_COLUMN")
        fd.puts("1st,2nd,3rd")
      end

      4.times.collect { queue.pop }
    end

    insist { events[0]["_csvmetadata"] } == true
    insist { events[0]["_schemacachetelemetry"] } == "newEntryCreated"

    insist { events[1]["A_COLUMN"] } == "first"
    insist { events[1]["B_COLUMN"] } == "second"
    insist { events[1]["C_COLUMN"] } == "third"
    insist { events[1]["_schemacachetelemetry"] } == "cachedEntryUsed"

    insist { events[2]["_csvmetadata"] } == true
    insist { events[2]["_schemacachetelemetry"] } == "newEntryCreated"

    insist { events[3]["D_COLUMN"] } == "1st"
    insist { events[3]["E_COLUMN"] } == "2nd"
    insist { events[3]["F_COLUMN"] } == "3rd"
    insist { events[3]["_schemacachetelemetry"] } == "cachedEntryUsed"

  end # it

  it "should resume processing of a csv file after logstash restarts" do
    tmpfile_path = Stud::Temporary.pathname
    sincedb_path = Stud::Temporary.pathname

    # Request that telemetry be added to the event to make cache usage visible.
    conf = <<-CONFIG
      input {
        csvfile {
          path => "#{tmpfile_path}"
          start_position => "beginning"
          sincedb_path => "#{sincedb_path}"
          delimiter => "#{delimiter}"
          first_line_defines_columns => true
          add_schema_cache_telemetry_to_event => true
        }
      }
    CONFIG

    File.open(tmpfile_path, "a") do |fd|
      fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
      fd.puts("first,second,third")
    end

    events = input(conf) do |pipeline, queue|
      2.times.collect { queue.pop }
    end

    insist { events[0]["_csvmetadata"] } == true
    insist { events[0]["_schemacachetelemetry"] } == "newEntryCreated"

    insist { events[1]["A_COLUMN"] } == "first"
    insist { events[1]["B_COLUMN"] } == "second"
    insist { events[1]["C_COLUMN"] } == "third"
    insist { events[1]["_schemacachetelemetry"] } == "cachedEntryUsed"

    File.open(tmpfile_path, "a") do |fd|
      fd.puts("fourth,fifth,sixth")
    end

    events = input(conf) do |pipeline, queue|
      1.times.collect { queue.pop }
    end

    insist { events[0]["A_COLUMN"] } == "fourth"
    insist { events[0]["B_COLUMN"] } == "fifth"
    insist { events[0]["C_COLUMN"] } == "sixth"
    insist { events[0]["_schemacachetelemetry"] } == "newEntryCreated"

  end # it

  it "should expire schema cache entries if untouched for more than their configured lifetime (10s in this case)" do

    # This was tricky to write. Key points:
    # - Utilizes a special white-box mode of the plugin that exposes what it's doing with its schema cache in telemetry attributes.
    # - While cache durations are typically multiple hours, for testing we dial it back to 10s via a small fractional number.
    # - All the various file IO has to go into the input block.
    # - The queue reads are sprinkled throughout to synchronize the test proc with logstash's file processing.
    # - Put the insists right after the queue reads to better tie the inputs to the expected outputs.

    puts "\nThe caching test now running will take a while... (~30s)"

    tmpfile_path = Stud::Temporary.pathname
    tmpfile2_path = Stud::Temporary.pathname
    tmpfile3_path = Stud::Temporary.pathname
    sincedb_path = Stud::Temporary.pathname

    conf = <<-CONFIG
      input {
        csvfile {
          path => "#{tmpfile_path}"
          path => "#{tmpfile2_path}"
          path => "#{tmpfile3_path}"
          start_position => "beginning"
          sincedb_path => "#{sincedb_path}"
          delimiter => "#{delimiter}"
          first_line_defines_columns => true
          max_cached_schema_age_hours => 0.0027777777777778
          add_schema_cache_telemetry_to_event => true
          discover_interval => 1
        }
      }
    CONFIG

    events = input(conf) do |pipeline, queue|

      # File1 initial entries. File1's schema will be cached.
      File.open(tmpfile_path, "a") do |fd|
        fd.puts("A_COLUMN,B_COLUMN,C_COLUMN")
        fd.puts("first,second,third")
      end
      # Verify File1 schema was cached and the schema row was tagged as csvmetadata
      event = queue.pop
      insist { event["_schemacachetelemetry"] } == "newEntryCreated"
      insist { event["_csvmetadata"] } == true

      # Verify that the cached File1 schema was used to decode row2 of File1
      event = queue.pop
      insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
      insist { event["A_COLUMN"] } == "first"
      insist { event["B_COLUMN"] } == "second"
      insist { event["C_COLUMN"] } == "third"

      # File2 initial entries
      File.open(tmpfile2_path, "a") do |fd|
        fd.puts("D_COLUMN,E_COLUMN,F_COLUMN")
        fd.puts("1st,2nd,3rd")
      end
      # Verify File2 schema was cached and the schema row was tagged as csvmetadata
      event = queue.pop
      insist { event["_schemacachetelemetry"] } == "newEntryCreated"
      insist { event["_csvmetadata"] } == true

      # Verify that the cached File2 schema was used to decode row2 of File2
      event = queue.pop
      insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
      insist { event["D_COLUMN"] } == "1st"
      insist { event["E_COLUMN"] } == "2nd"
      insist { event["F_COLUMN"] } == "3rd"

      # Touch File1 before its cached schema entry expires (<10s), refreshing the entry.
      sleep 5
      File.open(tmpfile_path, "a") do |fd|
        fd.puts("fourth,fifth,sixth")
      end
      # Verify that the still-cached File1 schema was used to decode the newly added row of File1
      event = queue.pop
      insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
      insist { event["A_COLUMN"] } == "fourth"
      insist { event["B_COLUMN"] } == "fifth"
      insist { event["C_COLUMN"] } == "sixth"

      # Touch File1 again after File2's cache entry expires.
      sleep 10
      File.open(tmpfile_path, "a") do |fd|
        fd.puts("seventh,eighth,ninth")
      end
      # Verify that File1's entry hasn't expired, by virtue of the previous touch refreshing it.
      event = queue.pop
      insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
      insist { event["A_COLUMN"] } == "seventh"
      insist { event["B_COLUMN"] } == "eighth"
      insist { event["C_COLUMN"] } == "ninth"

      # Touch File3. Creation of its cache entry forces a purge of File2's expired entry, which is made visible via telemetry.
      sleep 1
      File.open(tmpfile3_path, "a") do |fd|
        fd.puts("X_COLUMN,Y_COLUMN,Z_COLUMN")
        fd.puts("erste,zweite,dritte")
      end
      # Verify that scrubbing of expired cache entries takes place, reducing the cached count from 2 (File1 & File2) to 1 (just File1).
      # (Scrubbing takes place before creation of File3's schema entry in the cache.)
      event = queue.pop
      insist { event["_csvmetadata"] } == true
      insist { event["_schemacachetelemetry"] } == "newEntryCreated"
      insist { event["_schemacachetelemetryscrubbedbeforecount"] } == 2
      insist { event["_schemacachetelemetryscrubbedaftercount"] } == 1

      # Verify that File3's schema did in fact get cached.
      event = queue.pop
      insist { event["_schemacachetelemetry"] } == "cachedEntryUsed"
      insist { event["X_COLUMN"] } == "erste"
      insist { event["Y_COLUMN"] } == "zweite"
      insist { event["Z_COLUMN"] } == "dritte"

      # File2 post-expiration entry. Should re-create the File2 cache entry.
      sleep 1
      File.open(tmpfile2_path, "a") do |fd|
        fd.puts("4th,5th,6th")
      end
      # Verify that File2's schema gets recreated (but not transmitted as an event since this isn't the natural row0 read).
      event = queue.pop
      insist { event["_schemacachetelemetry"] } == "newEntryCreated"
      insist { event["D_COLUMN"] } == "4th"
      insist { event["E_COLUMN"] } == "5th"
      insist { event["F_COLUMN"] } == "6th"

    end # input block
  end # it

end
metadata
ADDED
@@ -0,0 +1,124 @@
--- !ruby/object:Gem::Specification
name: logstash-input-csvfile
version: !ruby/object:Gem::Version
  version: 0.0.6
platform: ruby
authors:
- jweite
autorequire:
bindir: bin
cert_chain: []
date: 2016-02-11 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: 1.5.0
    - - <
      - !ruby/object:Gem::Version
        version: 3.0.0
  name: logstash-core
  prerelease: false
  type: :runtime
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: 1.5.0
    - - <
      - !ruby/object:Gem::Version
        version: 3.0.0
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  name: logstash-codec-plain
  prerelease: false
  type: :runtime
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: 1.0.1
  name: logstash-input-file
  prerelease: false
  type: :runtime
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: 1.0.1
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  name: stud
  prerelease: false
  type: :runtime
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  name: logstash-devutils
  prerelease: false
  type: :development
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
description: This gem is a logstash plugin required to be installed on top of the Logstash core pipeline using $LS_HOME/bin/plugin install gemname. This gem is not a stand-alone program
email: jweite@yahoo.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- lib/logstash/inputs/csvfile.rb
- spec/inputs/csvfile_spec.rb
homepage: ''
licenses:
- Apache License (2.0)
metadata:
  logstash_plugin: 'true'
  logstash_group: input
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.4.5
signing_key:
specification_version: 4
summary: Extends logstash-input-file to parse csv files, optionally respecting 'first-line schemas'
test_files:
- spec/inputs/csvfile_spec.rb