embulk 0.4.1 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/build.gradle +8 -2
- data/embulk-core/src/main/java/org/embulk/command/Runner.java +5 -8
- data/embulk-docs/push-gh-pages.sh +1 -1
- data/embulk-docs/src/release.rst +2 -0
- data/embulk-docs/src/release/release-0.4.1.rst +2 -2
- data/embulk-docs/src/release/release-0.4.2.rst +18 -0
- data/lib/embulk/command/embulk_new_plugin.rb +9 -9
- data/lib/embulk/data/new/java/plugin_loader.rb.erb +1 -1
- data/lib/embulk/data/new/ruby/filter.rb.erb +1 -1
- data/lib/embulk/data/new/ruby/gemspec.erb +2 -2
- data/lib/embulk/data/new/ruby/input.rb.erb +1 -1
- data/lib/embulk/guess/charset.rb +28 -0
- data/lib/embulk/guess/csv.rb +206 -0
- data/lib/embulk/guess/gzip.rb +18 -0
- data/lib/embulk/guess/newline.rb +22 -0
- data/lib/embulk/guess/time_format_guess.rb +333 -0
- data/lib/embulk/version.rb +1 -1
- metadata +9 -8
- data/lib/embulk/guess_charset.rb +0 -26
- data/lib/embulk/guess_csv.rb +0 -204
- data/lib/embulk/guess_gzip.rb +0 -16
- data/lib/embulk/guess_newline.rb +0 -20
- data/lib/embulk/time_format_guess.rb +0 -331
@@ -0,0 +1,22 @@
|
|
1
|
+
module Embulk
  module Guess

    # Guess plugin, registered under the name 'newline', that infers which
    # line terminator a text sample uses.
    class NewlineGuessPlugin < TextGuessPlugin
      Plugin.register_guess('newline', self)

      # Counts "\r", "\n" and "\r\n" occurrences in sample_text and returns a
      # parser config fragment naming the dominant terminator.
      #
      # config is accepted but not read by this method.
      # Returns {"parser" => {"newline" => "CRLF" | "CR" | "LF"}}; "LF" is
      # the fallback when neither of the other two dominates.
      def guess_text(config, sample_text)
        crlf_total = sample_text.scan(/\r\n/).length
        cr_total = sample_text.count("\r")
        lf_total = sample_text.count("\n")

        # Every CRLF also contributes one CR and one LF to the plain counts,
        # hence the comparison against half of each (integer division).
        newline =
          if crlf_total > cr_total / 2 && crlf_total > lf_total / 2
            "CRLF"
          elsif cr_total > lf_total / 2
            "CR"
          else
            "LF"
          end

        {"parser" => {"newline" => newline}}
      end
    end

  end
end
|
@@ -0,0 +1,333 @@
|
|
1
|
+
module Embulk::Guess
|
2
|
+
module TimeFormatGuess
|
3
|
+
# Regexp fragments for the individual components of a timestamp.
# The *_NODELIM variants require a leading zero (fixed two-digit width) so
# that they can be used when components are not separated by delimiters.
module Parts
  YEAR = /[1-4][0-9]{3}/
  MONTH = /10|11|12|[0 ]?[0-9]/
  MONTH_NODELIM = /10|11|12|[0][0-9]/
  # Alternation is ordered: 30|31 must be tried before the optional-prefix
  # single-digit alternative, otherwise "30" matches as "3" and leaves a
  # stray "0" behind, which makes the enclosing date pattern fail.
  DAY = /30|31|[1-2][0-9]|[0 ]?[1-9]/
  DAY_NODELIM = /30|31|[1-2][0-9]|[0][1-9]/
  HOUR = /20|21|22|23|24|1[0-9]|[0 ]?[0-9]/
  HOUR_NODELIM = /20|21|22|23|24|1[0-9]|[0][0-9]/
  # MINUTE and SECOND share one Regexp object; 60 is accepted by both.
  MINUTE = SECOND = /60|[1-5][0-9]|[0 ]?[0-9]/
  MINUTE_NODELIM = SECOND_NODELIM = /60|[1-5][0-9]|[0][0-9]/

  MONTH_NAME_SHORT = /Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec/
  MONTH_NAME_FULL = /January|February|March|April|May|June|July|August|September|October|November|December/

  WEEKDAY_NAME_SHORT = /Sun|Mon|Tue|Wed|Thu|Fri|Sat/
  WEEKDAY_NAME_FULL = /Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday/
end
|
20
|
+
|
21
|
+
# Result of a structural date/time match: the ordered list of recognised
# parts, the delimiter found between each pair of adjacent parts, and a
# per-part option (a heading-style symbol, a digit count for :frac, or nil).
class GuessMatch
  # delimiters   - Array of Strings; delimiters[i-1] precedes parts[i].
  # parts        - Array of Symbols (:year, :month, :day, :hour, :minute,
  #                :second, :frac, :zone_off, :zone_abb).
  # part_options - Array parallel to parts.
  def initialize(delimiters, parts, part_options)
    @delimiters = delimiters
    @parts = parts
    @part_options = part_options
  end

  attr_reader :part_options

  # Builds the strftime-style format string for this match.
  #
  # Raises RuntimeError ("Unknown part: ...") for an unrecognised part.
  def format
    pieces = []
    @parts.each_with_index do |part, i|
      # A delimiter slot may hold nil (an optional capture that did not
      # participate); treat it as "no delimiter" instead of raising
      # TypeError while building the string.
      pieces << @delimiters[i - 1].to_s if i != 0
      pieces << directive_for(part, @part_options[i])
    end
    pieces.join
  end

  # Matches with an equal mergeable_group can be combined with merge!.
  def mergeable_group
    [@delimiters, @parts]
  end

  # Fills options that are still unset (nil) on this match from another
  # match of the same group. Options that are already set are kept.
  def merge!(another_in_group)
    other_options = another_in_group.part_options
    @part_options.size.times do |i|
      @part_options[i] ||= other_options[i]
    end
  end

  private

  # Maps one part and its option to a format directive.
  def directive_for(part, option)
    case part
    when :year
      '%Y'
    when :month
      # '%_m' (blank padded) and '%-m' (no padding) are not supported,
      # so every heading option maps to '%m'.
      '%m'
    when :day
      # :none (no padding) is not supported and falls back to '%d'.
      option == :blank ? '%e' : '%d'
    when :hour
      # :none (no padding) is not supported and is treated like :blank.
      (option == :blank || option == :none) ? '%k' : '%H'
    when :minute
      # heading options are not supported
      '%M'
    when :second
      # heading options are not supported
      '%S'
    when :frac
      # option is the number of fractional digits seen.
      option <= 3 ? '%L' : '%N'
    when :zone_off
      '%z'
    when :zone_abb
      '%Z'
    else
      raise "Unknown part: #{part}"
    end
  end
end
|
141
|
+
|
142
|
+
# Structural matcher for numeric timestamps: a date (year-month-day or
# day-month-year, with or without delimiters), optionally followed by a
# time of day and a time zone. A successful match is returned as a
# GuessMatch from which a format string can be derived.
class GuessPattern
  include Parts

  date_delims = /[\/\-]/
  # yyyy-MM-dd
  YMD = /(?<year>#{YEAR})(?<date_delim>#{date_delims})(?<month>#{MONTH})\k<date_delim>(?<day>#{DAY})/
  YMD_NODELIM = /(?<year>#{YEAR})(?<month>#{MONTH_NODELIM})(?<day>#{DAY_NODELIM})/
  # dd/MM/yyyy
  # (these were previously copies of the YMD patterns, which made the
  # day-first branch of #match unreachable)
  DMY = /(?<day>#{DAY})(?<date_delim>#{date_delims})(?<month>#{MONTH})\k<date_delim>(?<year>#{YEAR})/
  DMY_NODELIM = /(?<day>#{DAY_NODELIM})(?<month>#{MONTH_NODELIM})(?<year>#{YEAR})/

  frac = /[0-9]{1,24}/
  time_delims = /[\:\-]/
  frac_delims = /[\.\,]/
  TIME = /(?<hour>#{HOUR})(?<time_delim>#{time_delims})(?<minute>#{MINUTE})(?:\k<time_delim>(?<second>#{SECOND})(?:(?<frac_delim>#{frac_delims})(?<frac>#{frac}))?)?/
  TIME_NODELIM = /(?<hour>#{HOUR_NODELIM})(?<minute>#{MINUTE_NODELIM})((?<second>#{SECOND_NODELIM})(?:(?<frac_delim>#{frac_delims})(?<frac>#{frac}))?)?/

  TZ = /(?<zone_space> )?(?<zone>(?<zone_off>[\-\+]\d\d(?::?\d\d)?)|(?<zone_abb>[A-Z]{3}))|(?<z>Z)/

  # Tries to recognise text as date [time] [zone].
  #
  # Returns a GuessMatch, or nil when text does not start with a supported
  # date or has trailing content that is neither a zone nor whitespace.
  # Year-first forms are tried before day-first forms.
  def match(text)
    delimiters = []
    parts = []
    part_options = []

    if dm = (/^#{YMD}(?<rest>.*?)$/.match(text) || /^#{YMD_NODELIM}(?<rest>.*?)$/.match(text))
      date_delim = optional_group(dm, "date_delim")

      parts << :year
      part_options << nil
      delimiters << date_delim

      parts << :month
      part_options << part_heading_option(dm["month"])
      delimiters << date_delim

      parts << :day
      part_options << part_heading_option(dm["day"])

    elsif dm = (/^#{DMY}(?<rest>.*?)$/.match(text) || /^#{DMY_NODELIM}(?<rest>.*?)$/.match(text))
      date_delim = optional_group(dm, "date_delim")

      parts << :day
      part_options << part_heading_option(dm["day"])
      delimiters << date_delim

      parts << :month
      part_options << part_heading_option(dm["month"])
      delimiters << date_delim

      # The year is the last date part: no delimiter is pushed after it,
      # so delimiters stays one element shorter than parts.
      parts << :year
      part_options << nil

    else
      return nil
    end
    rest = dm["rest"]

    date_time_delims = /[ _T]/
    if tm = (
          /^(?<date_time_delim>#{date_time_delims})#{TIME}(?<rest>.*?)?$/.match(rest) ||
          /^(?<date_time_delim>#{date_time_delims})#{TIME_NODELIM}(?<rest>.*?)?$/.match(rest) ||
          # a time glued directly to the date is only accepted when the
          # date itself was written without delimiters
          (date_delim == "" && /^#{TIME_NODELIM}(?<rest>.*?)?$/.match(rest))
        )
      date_time_delim = optional_group(tm, "date_time_delim")
      time_delim = optional_group(tm, "time_delim")

      delimiters << date_time_delim
      parts << :hour
      part_options << part_heading_option(tm["hour"])

      delimiters << time_delim
      parts << :minute
      part_options << part_heading_option(tm["minute"])

      if tm["second"]
        delimiters << time_delim
        parts << :second
        part_options << part_heading_option(tm["second"])
      end

      if tm["frac"]
        delimiters << tm["frac_delim"]
        parts << :frac
        part_options << tm["frac"].size
      end

      rest = tm["rest"]
    end

    if zm = /^#{TZ}$/.match(rest)
      # Parenthesised on purpose: `a << b || c` parses as `(a << b) || c`
      # and would push nil when the zone has no leading space.
      delimiters << (zm["zone_space"] || '')
      if zm["z"]
        # TODO ISO 8601
        parts << :zone_off
      elsif zm["zone_off"]
        parts << :zone_off
      else
        parts << :zone_abb
      end
      part_options << nil

      return GuessMatch.new(delimiters, parts, part_options)

    elsif rest =~ /^\s*$/
      return GuessMatch.new(delimiters, parts, part_options)

    else
      return nil
    end
  end

  # Classifies how a numeric component is padded:
  # :zero (leading "0"), :blank (leading space), :none (single character),
  # or nil when the text gives no hint.
  def part_heading_option(text)
    if text[0] == '0'
      :zero
    elsif text[0] == ' '
      :blank
    elsif text.size == 1
      :none
    else
      nil
    end
  end

  private

  # Returns the text captured by the named group, or "" when the pattern
  # that produced md does not define that group (the *_NODELIM patterns
  # have no delimiter groups).
  def optional_group(md, name)
    md.names.include?(name) ? md[name].to_s : ""
  end
end
|
267
|
+
|
268
|
+
# Match result for a fixed, well-known timestamp layout: it simply carries
# the predefined format string.
class RegexpMatch
  # format - the format string reported for every text matching the pattern.
  def initialize(format)
    # The assignment was missing before, so #format and #mergeable_group
    # always returned nil.
    @format = format
  end

  attr_reader :format

  # Matches carrying the same format string belong to the same group.
  def mergeable_group
    @format
  end

  # Nothing to merge: the format is fixed. Returns nil.
  def merge!(another_in_group)
  end
end
|
282
|
+
|
283
|
+
# Pairs a regular expression with a fixed format string. The RegexpMatch
# for that format is built once and handed out for every matching text.
class RegexpPattern
  def initialize(regexp, format)
    @regexp = regexp
    @match = RegexpMatch.new(format)
  end

  # Returns the shared RegexpMatch when text matches the pattern, else nil.
  def match(text)
    (@regexp =~ text) ? @match : nil
  end
end
|
297
|
+
|
298
|
+
# Whole-string patterns for well-known textual timestamp layouts.
module StandardPatterns
  include Parts

  # e.g. "Sun, 06 Nov 1994 08:49:37 GMT"
  RFC_822_1123 = /^#{WEEKDAY_NAME_SHORT}, \d\d #{MONTH_NAME_SHORT} \d\d\d\d \d\d:\d\d:\d\d [a-zA-Z]{3}$/
  # e.g. "Sunday, 06-Nov-94 08:49:37 GMT"
  RFC_850_1035 = /^#{WEEKDAY_NAME_FULL}, \d\d-#{MONTH_NAME_SHORT}-\d\d \d\d:\d\d:\d\d [a-zA-Z]{3}$/
  # e.g. "10/Oct/2000:13:55:36 -0700" (a colon separates date and time in
  # Apache's common log format)
  APACHE_CLF = /^\d\d\/#{MONTH_NAME_SHORT}\/\d\d\d\d:\d\d:\d\d:\d\d [\-\+]\d\d(?::?\d\d)?$/
  # e.g. "Sun Nov  6 08:49:37 1994"
  ANSI_C_ASCTIME = /^#{WEEKDAY_NAME_SHORT} #{MONTH_NAME_SHORT} \d\d? \d\d:\d\d:\d\d \d\d\d\d$/
end

# Patterns tried, in order, against every sample text.
# Zone directives follow what each regexp accepts: %Z where the pattern
# matches an alphabetic zone abbreviation, %z where it matches a numeric
# UTC offset (the same mapping GuessMatch uses for :zone_abb / :zone_off).
PATTERNS = [
  GuessPattern.new,
  RegexpPattern.new(StandardPatterns::RFC_822_1123, "%a, %d %b %Y %H:%M:%S %Z"),
  RegexpPattern.new(StandardPatterns::RFC_850_1035, "%A, %d-%b-%y %H:%M:%S %Z"),
  RegexpPattern.new(StandardPatterns::APACHE_CLF, "%d/%b/%Y:%H:%M:%S %z"),
  RegexpPattern.new(StandardPatterns::ANSI_C_ASCTIME, "%a %b %e %H:%M:%S %Y"),
]
|
314
|
+
|
315
|
+
# Guesses a timestamp format string from one text or an array of texts.
#
# Empty strings are ignored. Every remaining text is tried against every
# entry of PATTERNS. When several matches are found they are grouped by
# mergeable_group; the group with the most members wins, its members are
# merged into its first match, and that match's format is returned.
#
# Returns the format string, or nil when nothing matched.
def self.guess(texts)
  texts = Array(texts).select {|text| text != "" }
  matches = texts.flat_map do |text|
    PATTERNS.map {|pattern| pattern.match(text) }.compact
  end

  return nil if matches.empty?
  return matches[0].format if matches.size == 1

  # Compare the member lists, not the [key, members] pairs a Hash yields
  # (a pair always has size 2, which made the choice arbitrary).
  groups = matches.group_by {|match| match.mergeable_group }.values
  best_match, *others = groups.max_by {|members| members.size }
  others.each {|m| best_match.merge!(m) }
  best_match.format
end
|
332
|
+
end
|
333
|
+
end
|
data/lib/embulk/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.1
|
4
|
+
version: 0.4.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
@@ -267,6 +267,7 @@ files:
|
|
267
267
|
- embulk-docs/src/release/release-0.3.2.rst
|
268
268
|
- embulk-docs/src/release/release-0.4.0.rst
|
269
269
|
- embulk-docs/src/release/release-0.4.1.rst
|
270
|
+
- embulk-docs/src/release/release-0.4.2.rst
|
270
271
|
- embulk-standards/build.gradle
|
271
272
|
- embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java
|
272
273
|
- embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java
|
@@ -335,10 +336,11 @@ files:
|
|
335
336
|
- lib/embulk/file_output_plugin.rb
|
336
337
|
- lib/embulk/filter_plugin.rb
|
337
338
|
- lib/embulk/formatter_plugin.rb
|
338
|
-
- lib/embulk/guess_charset.rb
|
339
|
-
- lib/embulk/guess_csv.rb
|
340
|
-
- lib/embulk/guess_gzip.rb
|
341
|
-
- lib/embulk/guess_newline.rb
|
339
|
+
- lib/embulk/guess/charset.rb
|
340
|
+
- lib/embulk/guess/csv.rb
|
341
|
+
- lib/embulk/guess/gzip.rb
|
342
|
+
- lib/embulk/guess/newline.rb
|
343
|
+
- lib/embulk/guess/time_format_guess.rb
|
342
344
|
- lib/embulk/guess_plugin.rb
|
343
345
|
- lib/embulk/input_plugin.rb
|
344
346
|
- lib/embulk/java/bootstrap.rb
|
@@ -352,7 +354,6 @@ files:
|
|
352
354
|
- lib/embulk/plugin.rb
|
353
355
|
- lib/embulk/plugin_registry.rb
|
354
356
|
- lib/embulk/schema.rb
|
355
|
-
- lib/embulk/time_format_guess.rb
|
356
357
|
- lib/embulk/version.rb
|
357
358
|
- settings.gradle
|
358
359
|
- classpath/annotations-3.0.0.jar
|
@@ -361,8 +362,8 @@ files:
|
|
361
362
|
- classpath/bval-jsr303-0.5.jar
|
362
363
|
- classpath/commons-beanutils-core-1.8.3.jar
|
363
364
|
- classpath/commons-lang3-3.1.jar
|
364
|
-
- classpath/embulk-core-0.4.1.jar
|
365
|
-
- classpath/embulk-standards-0.4.1.jar
|
365
|
+
- classpath/embulk-core-0.4.2.jar
|
366
|
+
- classpath/embulk-standards-0.4.2.jar
|
366
367
|
- classpath/guava-18.0.jar
|
367
368
|
- classpath/guice-3.0.jar
|
368
369
|
- classpath/guice-multibindings-3.0.jar
|
data/lib/embulk/guess_charset.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
module Embulk
|
2
|
-
|
3
|
-
class GuessCharset < GuessPlugin
|
4
|
-
Plugin.register_guess('charset', self)
|
5
|
-
|
6
|
-
def guess(config, sample_buffer)
|
7
|
-
# ICU4J
|
8
|
-
detector = com.ibm.icu.text.CharsetDetector.new
|
9
|
-
detector.setText(sample_buffer.to_java_bytes)
|
10
|
-
best_match = detector.detect
|
11
|
-
if best_match.getConfidence < 50
|
12
|
-
name = "UTF-8"
|
13
|
-
else
|
14
|
-
name = best_match.getName
|
15
|
-
if name == "ISO-8859-1"
|
16
|
-
# ISO-8859-1 means ASCII which is a subset
|
17
|
-
# of UTF-8 in most of cases due to lack of
|
18
|
-
# sample data set
|
19
|
-
name = "UTF-8"
|
20
|
-
end
|
21
|
-
end
|
22
|
-
return {"parser" => {"charset" => name}}
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
end
|
data/lib/embulk/guess_csv.rb
DELETED
@@ -1,204 +0,0 @@
|
|
1
|
-
module Embulk
|
2
|
-
require_relative 'time_format_guess'
|
3
|
-
|
4
|
-
class GuessCsv < LineGuessPlugin
|
5
|
-
Plugin.register_guess('csv', self)
|
6
|
-
|
7
|
-
DELIMITER_CANDIDATES = [
|
8
|
-
",", "\t", "|"
|
9
|
-
]
|
10
|
-
|
11
|
-
QUOTE_CANDIDATES = [
|
12
|
-
"\"", "'"
|
13
|
-
]
|
14
|
-
|
15
|
-
# CsvParserPlugin.TRUE_STRINGS
|
16
|
-
TRUE_STRINGS = Hash[*%w[
|
17
|
-
true True TRUE
|
18
|
-
yes Yes YES
|
19
|
-
y Y
|
20
|
-
on On ON
|
21
|
-
1
|
22
|
-
].map {|k| [k, true] }]
|
23
|
-
|
24
|
-
def guess_lines(config, sample_lines)
|
25
|
-
delim = guess_delimiter(sample_lines)
|
26
|
-
unless delim
|
27
|
-
# not CSV file
|
28
|
-
return {}
|
29
|
-
end
|
30
|
-
|
31
|
-
parser_config = config["parser"] || {}
|
32
|
-
parser_guessed = {"type" => "csv", "delimiter" => delim}
|
33
|
-
|
34
|
-
quote = guess_quote(sample_lines, delim)
|
35
|
-
parser_guessed["quote"] = quote ? quote : ''
|
36
|
-
|
37
|
-
sample_records = sample_lines.map {|line| line.split(delim) } # TODO use CsvTokenizer
|
38
|
-
first_types = guess_field_types(sample_records[0, 1])
|
39
|
-
other_types = guess_field_types(sample_records[1..-1])
|
40
|
-
|
41
|
-
if first_types.size <= 1 || other_types.size <= 1
|
42
|
-
# guess failed
|
43
|
-
return {}
|
44
|
-
end
|
45
|
-
|
46
|
-
unless parser_config.has_key?("header_line")
|
47
|
-
parser_guessed["header_line"] = (first_types != other_types && !first_types.any? {|t| t != ["string"] })
|
48
|
-
end
|
49
|
-
|
50
|
-
unless parser_config.has_key?("columns")
|
51
|
-
if parser_guessed["header_line"] || parser_config["header_line"]
|
52
|
-
column_names = sample_records.first
|
53
|
-
else
|
54
|
-
column_names = (0..other_types.size).to_a.map {|i| "c#{i}" }
|
55
|
-
end
|
56
|
-
schema = []
|
57
|
-
column_names.zip(other_types).each do |name,(type,format)|
|
58
|
-
if name && type
|
59
|
-
if format
|
60
|
-
schema << {"name" => name, "type" => type, "format" => format}
|
61
|
-
else
|
62
|
-
schema << {"name" => name, "type" => type}
|
63
|
-
end
|
64
|
-
end
|
65
|
-
end
|
66
|
-
parser_guessed["columns"] = schema
|
67
|
-
end
|
68
|
-
|
69
|
-
return {"parser" => parser_guessed}
|
70
|
-
end
|
71
|
-
|
72
|
-
private
|
73
|
-
|
74
|
-
def guess_delimiter(sample_lines)
|
75
|
-
delim_weights = DELIMITER_CANDIDATES.map do |d|
|
76
|
-
counts = sample_lines.map {|line| line.count(d) }
|
77
|
-
total = array_sum(counts)
|
78
|
-
if total > 0
|
79
|
-
stddev = array_standard_deviation(counts)
|
80
|
-
stddev = 0.000000001 if stddev == 0.0
|
81
|
-
weight = total / stddev
|
82
|
-
[d, weight]
|
83
|
-
else
|
84
|
-
[nil, 0]
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
delim, weight = *delim_weights.sort_by {|d,weight| weight }.last
|
89
|
-
if delim != nil && weight > 1
|
90
|
-
return delim
|
91
|
-
else
|
92
|
-
return nil
|
93
|
-
end
|
94
|
-
end
|
95
|
-
|
96
|
-
def guess_quote(sample_lines, delim)
|
97
|
-
delim_regexp = Regexp.escape(delim)
|
98
|
-
quote_weights = QUOTE_CANDIDATES.map do |q|
|
99
|
-
weights = sample_lines.map do |line|
|
100
|
-
q_regexp = Regexp.escape(q)
|
101
|
-
count = line.count(q)
|
102
|
-
if count > 0
|
103
|
-
weight = count
|
104
|
-
weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{q_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 20
|
105
|
-
weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{delim_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 40
|
106
|
-
weight
|
107
|
-
else
|
108
|
-
nil
|
109
|
-
end
|
110
|
-
end.compact
|
111
|
-
weights.empty? ? 0 : array_avg(weights)
|
112
|
-
end
|
113
|
-
quote, weight = QUOTE_CANDIDATES.zip(quote_weights).sort_by {|q,w| w }.last
|
114
|
-
if weight >= 10.0
|
115
|
-
return quote
|
116
|
-
else
|
117
|
-
return nil
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
def guess_field_types(field_lines)
|
122
|
-
column_lines = []
|
123
|
-
field_lines.each do |fields|
|
124
|
-
fields.each_with_index {|field,i| (column_lines[i] ||= []) << guess_type(field) }
|
125
|
-
end
|
126
|
-
columns = column_lines.map do |types|
|
127
|
-
t = types.inject(nil) {|r,t| merge_type(r,t) } || "string"
|
128
|
-
if t.is_a?(TimestampMatch)
|
129
|
-
format = TimeFormatGuess.guess(types.map {|type| type.text })
|
130
|
-
["timestamp", format]
|
131
|
-
else
|
132
|
-
[t]
|
133
|
-
end
|
134
|
-
end
|
135
|
-
return columns
|
136
|
-
end
|
137
|
-
|
138
|
-
TYPE_COALESCE = Hash[{
|
139
|
-
long: :double,
|
140
|
-
boolean: :long,
|
141
|
-
}.map {|k,v|
|
142
|
-
[[k.to_s, v.to_s].sort, v.to_s]
|
143
|
-
}]
|
144
|
-
|
145
|
-
def merge_type(type1, type2)
|
146
|
-
if type1 == type2
|
147
|
-
type1
|
148
|
-
elsif type1.nil? || type2.nil?
|
149
|
-
type1 || type2
|
150
|
-
else
|
151
|
-
TYPE_COALESCE[[type1, type2].sort] || "string"
|
152
|
-
end
|
153
|
-
end
|
154
|
-
|
155
|
-
class TimestampMatch < String
|
156
|
-
def initialize(text)
|
157
|
-
super("timestamp")
|
158
|
-
@text = text
|
159
|
-
end
|
160
|
-
attr_reader :text
|
161
|
-
end
|
162
|
-
|
163
|
-
def guess_type(str)
|
164
|
-
if TRUE_STRINGS[str]
|
165
|
-
return "boolean"
|
166
|
-
end
|
167
|
-
|
168
|
-
if TimeFormatGuess.guess(str)
|
169
|
-
return TimestampMatch.new(str)
|
170
|
-
end
|
171
|
-
|
172
|
-
if str.to_i.to_s == str
|
173
|
-
return "long"
|
174
|
-
end
|
175
|
-
|
176
|
-
if str.include?('.')
|
177
|
-
a, b = str.split(".", 2)
|
178
|
-
if a.to_i.to_s == a && b.to_i.to_s == b
|
179
|
-
return "double"
|
180
|
-
end
|
181
|
-
end
|
182
|
-
|
183
|
-
return "string"
|
184
|
-
end
|
185
|
-
|
186
|
-
def array_sum(array)
|
187
|
-
array.inject(0) {|r,i| r += i }
|
188
|
-
end
|
189
|
-
|
190
|
-
def array_avg(array)
|
191
|
-
array.inject(0.0) {|r,i| r += i } / array.size
|
192
|
-
end
|
193
|
-
|
194
|
-
def array_variance(array)
|
195
|
-
avg = array_avg(array)
|
196
|
-
array.inject(0.0) {|r,i| r += (i - avg) ** 2 } / array.size
|
197
|
-
end
|
198
|
-
|
199
|
-
def array_standard_deviation(array)
|
200
|
-
Math.sqrt(array_variance(array))
|
201
|
-
end
|
202
|
-
end
|
203
|
-
|
204
|
-
end
|