embulk-guess-csv_verify 0.10.29-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/classpath/embulk-guess-csv-0.10.29.jar +0 -0
- data/classpath/embulk-guess-csv_verify-0.10.29.jar +0 -0
- data/classpath/embulk-parser-csv-0.10.29.jar +0 -0
- data/classpath/embulk-util-config-0.2.1.jar +0 -0
- data/classpath/embulk-util-file-0.1.3.jar +0 -0
- data/classpath/embulk-util-guess-0.1.1.jar +0 -0
- data/classpath/embulk-util-json-0.1.0.jar +0 -0
- data/classpath/embulk-util-rubytime-0.3.2.jar +0 -0
- data/classpath/embulk-util-text-0.1.0.jar +0 -0
- data/classpath/embulk-util-timestamp-0.2.1.jar +0 -0
- data/classpath/icu4j-54.1.1.jar +0 -0
- data/classpath/jackson-annotations-2.6.7.jar +0 -0
- data/classpath/jackson-core-2.6.7.jar +0 -0
- data/classpath/jackson-databind-2.6.7.jar +0 -0
- data/classpath/jackson-datatype-jdk8-2.6.7.jar +0 -0
- data/classpath/validation-api-1.1.0.Final.jar +0 -0
- data/lib/embulk/guess/csv_verify.rb +474 -0
- metadata +64 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 560fef03cacdda19b9365be2a7352d5a00c47b415fb89cd1f7ae6664d0603091
|
4
|
+
data.tar.gz: 79ab9de8f15c812cd3093207329674370d37563bdc05360f2652072a15afe12c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ced3fb4071fa98eb20d4411272184217e9fc28e2c19e29192830d9fbcc158b67cdb87cfa7191c3fdf78794017009feef25982f60ae34bbae7cee08dc9257eafb
|
7
|
+
data.tar.gz: 357413c30817f29b0d2bdd1e22f093387374d4b1affff93e4ff88463f3468dfcfba4d1115da4f0f8daf48eb7f3a511ac012b0173af16a867fb1f2c63d9272bb6
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,474 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Guess
|
3
|
+
require 'embulk/guess/schema_guess'
|
4
|
+
require 'embulk/logger'
|
5
|
+
|
6
|
+
class CsvGuessPlugin < LineGuessPlugin
|
7
|
+
Plugin.register_guess('csv_verify', self)
|
8
|
+
|
9
|
+
# Builds the class loader used to load the Java classes bundled with this plugin.
#
# Every jar found under the "classpath" directory (resolved relative to this
# file) is turned into a URL and handed to the class loader factory that is
# read from the global variable $temporary_internal_plugin_class_loader_factory__.
# The new loader is created with JRuby's class loader as its second argument.
#
# Raises PluginLoadError when JRubyPluginSource does not define the expected
# constant, or when the constant does not name the expected global variable.
def self.create_classloader
  classpath_dir = File.expand_path('../../../../classpath', __FILE__)
  jar_urls = Dir["#{classpath_dir}/**/*.jar"].map do |jar_path|
    java.io.File.new(File.expand_path(jar_path)).toURI.toURL
  end

  variable_name =
    begin
      Java::org.embulk.jruby.JRubyPluginSource::PLUGIN_CLASS_LOADER_FACTORY_VARIABLE_NAME
    rescue
      raise PluginLoadError.new "Java's org.embulk.jruby.JRubyPluginSource does not define PLUGIN_CLASS_LOADER_FACTORY_VARIABLE_NAME unexpectedly."
    end

  # The global variable below is referenced literally, so the name advertised
  # by the Java side has to match it exactly.
  unless variable_name == "$temporary_internal_plugin_class_loader_factory__"
    raise PluginLoadError.new "Java's org.embulk.jruby.JRubyPluginSource does not define PLUGIN_CLASS_LOADER_FACTORY_VARIABLE_NAME correctly."
  end

  $temporary_internal_plugin_class_loader_factory__.create(jar_urls, JRuby.runtime.getJRubyClassLoader())
end
|
23
|
+
|
24
|
+
# Class loader over the jars bundled with this gem.
CLASSLOADER = create_classloader

# Configuration mapping helpers loaded through CLASSLOADER.
CONFIG_MAPPER_FACTORY_CLASS = CLASSLOADER.loadClass("org.embulk.util.config.ConfigMapperFactory").ruby_class
TYPE_MODULE_CLASS = CLASSLOADER.loadClass("org.embulk.util.config.modules.TypeModule").ruby_class
CONFIG_MAPPER_FACTORY = CONFIG_MAPPER_FACTORY_CLASS.builder.addDefaultModules.addModule(TYPE_MODULE_CLASS.new).build

# Note: unlike the other classes, PLUGIN_TASK_CLASS is kept as the loaded
# Java class itself (no .ruby_class); it is passed to ConfigMapper#map.
PLUGIN_TASK_CLASS = CLASSLOADER.loadClass("org.embulk.parser.csv.CsvParserPlugin$PluginTask")

# Java classes used to tokenize sample lines and to run the Java-based guess.
LIST_FILE_INPUT_CLASS = CLASSLOADER.loadClass("org.embulk.util.file.ListFileInput").ruby_class
LINE_DECODER_CLASS = CLASSLOADER.loadClass("org.embulk.util.text.LineDecoder").ruby_class
CSV_GUESS_PLUGIN_CLASS = CLASSLOADER.loadClass("org.embulk.guess.csv.CsvGuessPlugin").ruby_class
CSV_TOKENIZER_CLASS = CLASSLOADER.loadClass("org.embulk.parser.csv.CsvTokenizer").ruby_class
TOO_FEW_COLUMNS_EXCEPTION_CLASS = CLASSLOADER.loadClass("org.embulk.parser.csv.CsvTokenizer$TooFewColumnsException").ruby_class
INVALID_VALUE_EXCEPTION_CLASS = CLASSLOADER.loadClass("org.embulk.parser.csv.CsvTokenizer$InvalidValueException").ruby_class

# Candidate values tried by the guess_* heuristics. The first delimiter
# candidate doubles as the fallback when no delimiter can be guessed.
DELIMITER_CANDIDATES = [",", "\t", "|", ";"]

QUOTE_CANDIDATES = ["\"", "'"]

ESCAPE_CANDIDATES = ["\\", '"']

NULL_STRING_CANDIDATES = [
  "null",
  "NULL",
  "#N/A",
  "\\N",  # MySQL LOAD, Hive STORED AS TEXTFILE
]

COMMENT_LINE_MARKER_CANDIDATES = [
  "#",
  "//",
]

# Limits used by guess_skip_header_lines: how many leading lines may be
# skipped, and how many following lines are inspected for each candidate.
MAX_SKIP_LINES = 10
NO_SKIP_DETECT_LINES = 10
|
62
|
+
|
63
|
+
# Runs the Ruby-based guess, then runs the Java-based guess on the same input
# and compares the two results.
#
# Any failure on the Java side -- an exception, a null result, or a result
# that differs from the Ruby one -- is only logged as an error; it never
# changes what this method returns.
#
# Returns the result of the Ruby-based implementation.
def guess_lines(config, sample_lines)
  guessed_ruby = guess_lines_iter(config, sample_lines)

  begin
    java_plugin = CSV_GUESS_PLUGIN_CLASS.new
    guessed_java = java_plugin.guess_lines(config_to_java(config), config_to_java(sample_lines))
    raise "embulk-guess-csv (Java) returned null." if guessed_java.nil?

    unless guessed_java.equals(config_to_java(guessed_ruby))
      raise_and_log_guess_diff(guessed_ruby, guessed_java)
    end
  rescue Exception => e
    # Any error from the Java-based guess plugin should pass-through just with logging.
    Embulk.logger.error "[Embulk CSV guess verify] #{e.inspect}"
  end

  # This plugin returns a result from the Ruby-based implementation.
  guessed_ruby
end
|
84
|
+
|
85
|
+
# Guesses CSV parser options from sample lines using the Ruby-based heuristics.
#
# config       - guess configuration; only its "parser" entry is read here.
# sample_lines - Array of sample line strings.
#
# Options already present in the "parser" entry are kept; missing ones
# (delimiter, quote, escape, null_string, comment_line_marker,
# trim_if_not_quoted, skip_header_lines, allow_extra_columns,
# allow_optional_columns, columns) are filled in.
#
# Returns {"parser" => guessed parser config}, or {} when the configured
# parser type is not "csv", when no record can be split out of the sample,
# or when no column type can be guessed.
def guess_lines_iter(config, sample_lines)
  return {} unless config.fetch("parser", {}).fetch("type", "csv") == "csv"

  parser_config = config["parser"] || {}
  if parser_config["type"] == "csv" && parser_config["delimiter"]
    delim = parser_config["delimiter"]
  else
    # When no delimiter can be guessed, assume a single column CSV.
    delim = guess_delimiter(sample_lines) || DELIMITER_CANDIDATES.first
  end

  parser_guessed = DataSource.new.merge(parser_config).merge({"type" => "csv", "delimiter" => delim})

  unless parser_guessed.has_key?("quote")
    quote = guess_quote(sample_lines, delim)
    # Nothing detected: assume the CSV follows the RFC for quoting, unless a
    # '"' shows up in the middle of a non-quoted value. In that case quoting
    # stays disabled (nil).
    quote = '"' if !quote && !guess_force_no_quote(sample_lines, delim, '"')
    parser_guessed["quote"] = quote
  end
  # Setting '' is not allowed any more. This converts the obsolete config
  # syntax to the explicit one.
  parser_guessed["quote"] = '"' if parser_guessed["quote"] == ''

  unless parser_guessed.has_key?("escape")
    quote = parser_guessed["quote"]
    # Escape does nothing if quoting is disabled, so it is only guessed
    # when a quote character is set.
    if quote
      escape = guess_escape(sample_lines, delim, quote)
      # Nothing detected: assume RFC-style escaping for '"'; otherwise
      # escaping stays disabled (nil).
      escape = '"' if !escape && quote == '"'
      parser_guessed["escape"] = escape
    end
  end

  unless parser_guessed.has_key?("null_string")
    null_string = guess_null_string(sample_lines, delim)
    # Don't even set null_string when nothing is found, to avoid confusion
    # of null and 'null' in YAML format.
    parser_guessed["null_string"] = null_string if null_string
  end

  # Guessing skip_header_lines should come before guessing
  # comment_line_marker because lines supplied to CsvTokenizer already don't
  # include skipped header lines. Skipping empty lines is also disabled here
  # because skipping header lines is done by CsvParser, which doesn't skip
  # empty lines automatically.
  sample_records = split_lines(parser_guessed, false, sample_lines, delim, {})
  skip_header_lines = guess_skip_header_lines(sample_records)
  sample_lines = sample_lines[skip_header_lines..-1]
  sample_records = sample_records[skip_header_lines..-1]

  unless parser_guessed.has_key?("comment_line_marker")
    comment_line_marker, sample_lines =
      guess_comment_line_marker(sample_lines, delim, parser_guessed["quote"], parser_guessed["null_string"])
    parser_guessed["comment_line_marker"] = comment_line_marker if comment_line_marker
  end

  sample_records = split_lines(parser_guessed, true, sample_lines, delim, {})

  # It should fail if the CSV parser cannot parse sample_lines.
  return {} if sample_records.nil? || sample_records.empty?

  if sample_lines.size == 1
    # The file contains only 1 line. Assume that there is no header line.
    header_line = false

    column_types = SchemaGuess.types_from_array_records(sample_records[0, 1])

    unless parser_guessed.has_key?("trim_if_not_quoted")
      # Enable trimming only when it changes the guessed column types.
      sample_records_trimmed = split_lines(parser_guessed, true, sample_lines, delim, {"trim_if_not_quoted" => true})
      column_types_trimmed = SchemaGuess.types_from_array_records(sample_records_trimmed)
      if column_types != column_types_trimmed
        parser_guessed["trim_if_not_quoted"] = true
        column_types = column_types_trimmed
      else
        parser_guessed["trim_if_not_quoted"] = false
      end
    end
  else
    # The file contains more than 1 line. If the guessed column types of the
    # first line are all strings or booleans, and they differ from those of
    # the other lines, assume that the first line holds column names.
    first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
    other_types = SchemaGuess.types_from_array_records(sample_records[1..-1] || [])

    unless parser_guessed.has_key?("trim_if_not_quoted")
      # Enable trimming only when it changes the guessed column types.
      sample_records_trimmed = split_lines(parser_guessed, true, sample_lines, delim, {"trim_if_not_quoted" => true})
      other_types_trimmed = SchemaGuess.types_from_array_records(sample_records_trimmed[1..-1] || [])
      if other_types != other_types_trimmed
        parser_guessed["trim_if_not_quoted"] = true
        other_types = other_types_trimmed
      else
        parser_guessed["trim_if_not_quoted"] = false
      end
    end

    header_line = (first_types != other_types && first_types.all? {|t| ["string", "boolean"].include?(t) }) || guess_string_header_line(sample_records)
    column_types = other_types
  end

  if column_types.empty?
    # TODO: this makes the guess fail if the file doesn't contain any
    # columns. However, this may not be convenient for users.
    return {}
  end

  # A detected header line is skipped in addition to the leading lines
  # already counted by guess_skip_header_lines.
  parser_guessed["skip_header_lines"] = header_line ? skip_header_lines + 1 : skip_header_lines

  parser_guessed["allow_extra_columns"] = false unless parser_guessed.has_key?("allow_extra_columns")
  parser_guessed["allow_optional_columns"] = false unless parser_guessed.has_key?("allow_optional_columns")

  if header_line
    column_names = sample_records.first.map(&:strip)
  else
    # One generated name ("c0", "c1", ...) per guessed column type. The
    # exclusive range avoids producing a trailing name without a type.
    column_names = (0...column_types.size).map {|i| "c#{i}" }
  end

  # Names and types are paired by position; a pair lacking either one is dropped.
  schema = column_names.zip(column_types)
    .select {|name, type| name && type }
    .map {|name, type| new_column(name, type) }
  parser_guessed["columns"] = schema

  return {"parser" => parser_guessed}
end
|
229
|
+
|
230
|
+
# Builds one schema entry for a guessed column.
#
# Returns a Hash with "name" and "type"; when type is a
# SchemaGuess::TimestampTypeMatch, its format is added under "format".
def new_column(name, type)
  column = {"name" => name, "type" => type}
  column["format"] = type.format if type.is_a?(SchemaGuess::TimestampTypeMatch)
  column
end
|
237
|
+
|
238
|
+
private
|
239
|
+
|
240
|
+
# Raises a RuntimeError describing how the "parser" sections of the Ruby and
# Java guess results differ.
#
# guessed_ruby_entire - Hash returned by the Ruby-based guess.
# guessed_java_entire - Java config object returned by the Java-based guess.
#
# For every key present on either side, the message lists whether it exists
# on only one side or whether the values differ. Values are compared only
# when the 'json' library can be loaded; otherwise a warning is logged and
# only key presence is reported.
def raise_and_log_guess_diff(guessed_ruby_entire, guessed_java_entire)
  guessed_ruby = guessed_ruby_entire["parser"] || {}
  guessed_java = guessed_java_entire.getNestedOrGetEmpty("parser")

  require 'set'
  keys = Set.new(guessed_ruby.keys) + Set.new(guessed_java.getAttributeNames)

  # The Java side is converted through JSON so that it can be compared with
  # plain Ruby values. It stays nil when 'json' is not available.
  guessed_java_hash =
    begin
      require 'json'
    rescue LoadError
      Embulk.logger.warn "The 'json' gem is not installed. No details compared."
      nil
    else
      JSON.parse(guessed_java.toJson)
    end

  diffs = keys.map do |key|
    if !guessed_ruby.has_key?(key)
      "Only embulk-guess-csv (Java) has: \"#{key}\""
    elsif !guessed_java.has(key.to_java)
      "Only embulk-guess-csv (Ruby) has: \"#{key}\""
    elsif guessed_java_hash && guessed_ruby[key] != guessed_java_hash[key]
      "embulk-guess-csv has difference between Java/Ruby: \"#{key}\""
    end
  end.compact

  raise "embulk-guess-csv has difference between Java/Ruby: #{diffs.inspect}"
end
|
269
|
+
|
270
|
+
# Recursively converts a Ruby value into its Java counterpart.
#
# * Hash  -> a new config source from CONFIG_MAPPER_FACTORY, with every
#            value converted recursively.
# * Array -> an unmodifiable java.util.List of converted elements.
# * other -> the result of calling to_java on the value.
def config_to_java(config_ruby)
  if config_ruby.is_a?(Hash)
    source = CONFIG_MAPPER_FACTORY.newConfigSource
    config_ruby.each_pair do |key, value|
      source.set(key.to_java, config_to_java(value))
    end
    source
  elsif config_ruby.is_a?(Array)
    list = Java::java.util.ArrayList.new
    config_ruby.each {|element| list.add(config_to_java(element)) }
    Java::java.util.Collections.unmodifiableList(list)
  else
    config_ruby.to_java
  end
end
|
288
|
+
|
289
|
+
# Splits sample lines into records (arrays of column values) with the CSV
# tokenizer loaded from the bundled Java classes.
#
# parser_config    - parser options guessed or configured so far.
# skip_empty_lines - passed to the tokenizer's nextRecord.
# sample_lines     - Array of sample line strings; each one is re-tagged as
#                    UTF-8 in place via force_encoding.
# delim            - delimiter, used only by the fallback splitter.
# extra_config     - options overriding parser_config for this call only
#                    (e.g. {"trim_if_not_quoted" => true}).
#
# Returns an Array of records. A non-quoted column equal to the configured
# null_string becomes nil. Lines raising the tokenizer's invalid-value
# exception are skipped. If anything else fails, falls back to a plain
# String#split on delim for every line.
def split_lines(parser_config, skip_empty_lines, sample_lines, delim, extra_config)
  null_string = parser_config["null_string"]
  config = parser_config.merge(extra_config).merge({"charset" => "UTF-8", "columns" => []})
  # Map the merged config, not the bare parser_config: otherwise extra_config
  # and the forced "charset"/"columns" entries would be silently ignored.
  parser_task = CONFIG_MAPPER_FACTORY.createConfigMapper.map(config_to_java(config), PLUGIN_TASK_CLASS)
  data = sample_lines.map {|line| line.force_encoding('UTF-8') }.join(parser_task.getNewline.getString.encode('UTF-8'))
  sample = Buffer.from_ruby_string(data)
  decoder = LINE_DECODER_CLASS.of(
    LIST_FILE_INPUT_CLASS.new([[sample.to_java]]), parser_task.getCharset, parser_task.getLineDelimiterRecognized.orElse(nil))
  tokenizer = CSV_TOKENIZER_CLASS.new(decoder, parser_task)
  rows = []
  while tokenizer.nextFile
    while tokenizer.nextRecord(skip_empty_lines)
      begin
        columns = []
        while true
          begin
            column = tokenizer.nextColumn
            quoted = tokenizer.wasQuotedColumn
            if null_string && !quoted && column == null_string
              column = nil
            end
            columns << column
          rescue TOO_FEW_COLUMNS_EXCEPTION_CLASS
            # The tokenizer signals the end of a record by raising; the
            # columns collected so far form the finished row.
            rows << columns
            break
          end
        end
      rescue INVALID_VALUE_EXCEPTION_CLASS
        # TODO warning
        tokenizer.skipCurrentLine
      end
    end
  end
  return rows
rescue
  # TODO warning if fallback to this ad-hoc implementation
  sample_lines.map {|line| line.split(delim) }
end
|
327
|
+
|
328
|
+
# Picks the most plausible delimiter among DELIMITER_CANDIDATES.
#
# Each candidate is weighted by its total number of occurrences divided by
# the standard deviation of its per-line counts, so a delimiter that appears
# often and evenly across lines scores highest.
#
# Returns the best candidate when its weight exceeds 1, otherwise nil.
def guess_delimiter(sample_lines)
  weighted = DELIMITER_CANDIDATES.map do |candidate|
    per_line = sample_lines.map {|line| line.count(candidate) }
    total = array_sum(per_line)
    next [nil, 0] unless total > 0

    deviation = array_standard_deviation(per_line)
    # Avoid dividing by zero when every line has the same count.
    deviation = 0.000000001 if deviation == 0.0
    [candidate, total / deviation]
  end

  best, best_weight = *weighted.sort_by {|_, w| w }.last
  (best && best_weight > 1) ? best : nil
end
|
349
|
+
|
350
|
+
# Picks the most plausible quote character among QUOTE_CANDIDATES.
#
# For every line containing the candidate, the weight is the number of
# occurrences, plus 20 per field that is wrapped in the candidate and has no
# candidate inside, plus 40 per wrapped field that has no delimiter inside.
# A candidate's score is the average weight over the lines containing it
# (0 when no line contains it).
#
# Returns the best candidate when its score is at least 10.0, otherwise nil.
def guess_quote(sample_lines, delim)
  delim_regexp = Regexp.escape(delim)

  quote_weights = QUOTE_CANDIDATES.map do |q|
    q_regexp = Regexp.escape(q)
    without_inner_quote = /(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{q_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/
    without_inner_delim = /(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{delim_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/

    weights = []
    sample_lines.each do |line|
      occurrences = line.count(q)
      next unless occurrences > 0

      weights << occurrences +
        line.scan(without_inner_quote).size * 20 +
        line.scan(without_inner_delim).size * 40
    end
    weights.empty? ? 0 : array_avg(weights)
  end

  quote, weight = QUOTE_CANDIDATES.zip(quote_weights).sort_by {|_, w| w }.last
  weight >= 10.0 ? quote : nil
end
|
374
|
+
|
375
|
+
# Tells whether quoting should be disabled for the given quote candidate.
#
# Returns true when, in at least one line, the quoting character appears in
# the middle of a non-quoted value (a field that starts with something other
# than the quote character and later contains it); false otherwise.
def guess_force_no_quote(sample_lines, delim, quote_candidate)
  escaped_delim = Regexp.escape(delim)
  escaped_quote = Regexp.escape(quote_candidate)
  mid_value_quote = /(?:\A|#{escaped_delim})\s*[^#{escaped_quote}]+#{escaped_quote}/

  sample_lines.any? {|line| line =~ mid_value_quote }
end
|
383
|
+
|
384
|
+
# Picks the escape character among ESCAPE_CANDIDATES.
#
# A candidate scores one point each time it is immediately followed by the
# delimiter or the quote character in the sample lines.
#
# Returns the highest-scoring candidate, or nil when no candidate scores.
def guess_escape(sample_lines, delim, quote)
  ranked = ESCAPE_CANDIDATES.map do |candidate|
    pattern = /#{Regexp.quote(candidate)}(?:#{Regexp.quote(delim)}|#{Regexp.quote(quote)})/
    hits = sample_lines.inject(0) {|sum, line| sum + line.scan(pattern).count }
    [candidate, hits]
  end
  ranked = ranked.select {|_, hits| hits > 0 }.sort_by {|_, hits| -hits }

  best = ranked.first
  best ? best[0] : nil
end
|
394
|
+
|
395
|
+
# Picks the null string among NULL_STRING_CANDIDATES.
#
# A candidate scores one point each time it makes up a whole field, i.e. it
# sits between line boundaries and/or delimiters.
#
# Returns the highest-scoring candidate, or nil when no candidate scores.
def guess_null_string(sample_lines, delim)
  ranked = NULL_STRING_CANDIDATES.map do |candidate|
    pattern = /(?:^|#{Regexp.quote(delim)})#{Regexp.quote(candidate)}(?:$|#{Regexp.quote(delim)})/
    hits = sample_lines.inject(0) {|sum, line| sum + line.scan(pattern).count }
    [candidate, hits]
  end
  ranked = ranked.select {|_, hits| hits > 0 }.sort_by {|_, hits| -hits }

  best_candidate, _best_hits = ranked.first
  best_candidate ? best_candidate : nil
end
|
405
|
+
|
406
|
+
# Guesses how many leading records should be skipped before the data starts.
#
# Looks for the first record (within the first MAX_SKIP_LINES) that has at
# least as many columns as each of the up to NO_SKIP_DETECT_LINES records
# following it.
#
# Returns the zero-based index of that record, or 0 when none is found.
def guess_skip_header_lines(sample_records)
  counts = sample_records.map(&:size)
  last_candidate = [MAX_SKIP_LINES, counts.length - 1].min

  first_stable = (1..last_candidate).find do |i|
    reference = counts[i - 1]
    counts[i, NO_SKIP_DETECT_LINES].all? {|c| c <= reference }
  end

  first_stable ? first_stable - 1 : 0
end
|
416
|
+
|
417
|
+
# Picks the comment line marker among COMMENT_LINE_MARKER_CANDIDATES.
#
# A line counts as a comment for a candidate when it starts with the
# candidate and does not start with the quote character, nor with the null
# string followed by the delimiter or the end of line.
#
# Returns a two-element Array: the candidate matching the most lines together
# with the lines that are not comments for it, or nil together with the
# untouched sample_lines when no candidate matches any line.
def guess_comment_line_marker(sample_lines, delim, quote, null_string)
  exclude = []
  exclude << /^#{Regexp.escape(quote)}/ if quote && !quote.empty?
  exclude << /^#{Regexp.escape(null_string)}(?:#{Regexp.escape(delim)}|$)/ if null_string

  ranked = COMMENT_LINE_MARKER_CANDIDATES.map do |marker|
    marker_regexp = /^#{Regexp.quote(marker)}/
    commented, remaining = sample_lines.partition do |line|
      exclude.all? {|ex| line !~ ex } && line =~ marker_regexp
    end
    [marker, commented.size, remaining]
  end
  ranked = ranked.select {|_, hits, _| hits > 0 }.sort_by {|_, hits, _| -hits }

  best = ranked.first
  return nil, sample_lines unless best

  marker, _hits, remaining = best
  return marker, remaining
end
|
438
|
+
|
439
|
+
# Guesses from value lengths whether the first record is a header line.
#
# For each column of the first record, the lengths of the non-nil values in
# that column are collected. When the lengths after the first one are nearly
# constant (variance <= 0.2) and the first length deviates from their average
# by more than 70% (or is greater than 1 when the average is 0), the first
# record is taken to be a header.
#
# Returns true as soon as one column satisfies this, false otherwise.
def guess_string_header_line(sample_records)
  sample_records.first.count.times do |column_index|
    lengths = sample_records.map {|row| row[column_index] }.compact.map {|v| v.to_s.size }
    next unless lengths.size > 1

    head_length = lengths[0]
    body_lengths = lengths[1..-1]
    next unless array_variance(body_lengths) <= 0.2

    avg = array_avg(body_lengths)
    stands_out =
      if avg == 0.0
        head_length > 1
      else
        (avg - head_length).abs / avg > 0.7
      end
    return true if stands_out
  end

  false
end
|
454
|
+
|
455
|
+
# Returns the sum of the elements of array, starting from integer 0
# (so an empty array sums to 0).
def array_sum(array)
  array.inject(0) {|total, value| total + value }
end
|
458
|
+
|
459
|
+
# Returns the arithmetic mean of array as a Float.
# An empty array yields NaN (0.0 divided by 0).
def array_avg(array)
  total = array.inject(0.0) {|acc, value| acc + value }
  total / array.size
end
|
462
|
+
|
463
|
+
# Returns the population variance of array: the mean of the squared
# deviations from array_avg(array), divided by the number of elements.
def array_variance(array)
  mean = array_avg(array)
  squared_error = array.inject(0.0) {|acc, value| acc + (value - mean) ** 2 }
  squared_error / array.size
end
|
467
|
+
|
468
|
+
# Returns the standard deviation of array, i.e. the square root of
# array_variance(array).
def array_standard_deviation(array)
  variance = array_variance(array)
  Math.sqrt(variance)
end
|
471
|
+
end
|
472
|
+
|
473
|
+
end
|
474
|
+
end
|
metadata
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: embulk-guess-csv_verify
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.10.29
|
5
|
+
platform: java
|
6
|
+
authors:
|
7
|
+
- Sadayuki Furuhashi
|
8
|
+
- Muga Nishizawa
|
9
|
+
- Dai MIKURUBE
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2021-04-07 00:00:00.000000000 Z
|
14
|
+
dependencies: []
|
15
|
+
description: Verification-purpose Embulk CSV guess plugin to compare the old Ruby-based
|
16
|
+
one and the new Java-based one (not for your production use)
|
17
|
+
email:
|
18
|
+
- dmikurube@treasure-data.com
|
19
|
+
executables: []
|
20
|
+
extensions: []
|
21
|
+
extra_rdoc_files: []
|
22
|
+
files:
|
23
|
+
- classpath/embulk-guess-csv-0.10.29.jar
|
24
|
+
- classpath/embulk-guess-csv_verify-0.10.29.jar
|
25
|
+
- classpath/embulk-parser-csv-0.10.29.jar
|
26
|
+
- classpath/embulk-util-config-0.2.1.jar
|
27
|
+
- classpath/embulk-util-file-0.1.3.jar
|
28
|
+
- classpath/embulk-util-guess-0.1.1.jar
|
29
|
+
- classpath/embulk-util-json-0.1.0.jar
|
30
|
+
- classpath/embulk-util-rubytime-0.3.2.jar
|
31
|
+
- classpath/embulk-util-text-0.1.0.jar
|
32
|
+
- classpath/embulk-util-timestamp-0.2.1.jar
|
33
|
+
- classpath/icu4j-54.1.1.jar
|
34
|
+
- classpath/jackson-annotations-2.6.7.jar
|
35
|
+
- classpath/jackson-core-2.6.7.jar
|
36
|
+
- classpath/jackson-databind-2.6.7.jar
|
37
|
+
- classpath/jackson-datatype-jdk8-2.6.7.jar
|
38
|
+
- classpath/validation-api-1.1.0.Final.jar
|
39
|
+
- lib/embulk/guess/csv_verify.rb
|
40
|
+
homepage: https://github.com/embulk/embulk
|
41
|
+
licenses:
|
42
|
+
- Apache-2.0
|
43
|
+
metadata: {}
|
44
|
+
post_install_message:
|
45
|
+
rdoc_options: []
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '0'
|
58
|
+
requirements: []
|
59
|
+
rubyforge_project:
|
60
|
+
rubygems_version: 2.7.9
|
61
|
+
signing_key:
|
62
|
+
specification_version: 4
|
63
|
+
summary: Verification-purpose Embulk CSV guess plugin (not for your production use)
|
64
|
+
test_files: []
|