embulk 0.4.1 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/build.gradle +8 -2
- data/embulk-core/src/main/java/org/embulk/command/Runner.java +5 -8
- data/embulk-docs/push-gh-pages.sh +1 -1
- data/embulk-docs/src/release.rst +2 -0
- data/embulk-docs/src/release/release-0.4.1.rst +2 -2
- data/embulk-docs/src/release/release-0.4.2.rst +18 -0
- data/lib/embulk/command/embulk_new_plugin.rb +9 -9
- data/lib/embulk/data/new/java/plugin_loader.rb.erb +1 -1
- data/lib/embulk/data/new/ruby/filter.rb.erb +1 -1
- data/lib/embulk/data/new/ruby/gemspec.erb +2 -2
- data/lib/embulk/data/new/ruby/input.rb.erb +1 -1
- data/lib/embulk/guess/charset.rb +28 -0
- data/lib/embulk/guess/csv.rb +206 -0
- data/lib/embulk/guess/gzip.rb +18 -0
- data/lib/embulk/guess/newline.rb +22 -0
- data/lib/embulk/guess/time_format_guess.rb +333 -0
- data/lib/embulk/version.rb +1 -1
- metadata +9 -8
- data/lib/embulk/guess_charset.rb +0 -26
- data/lib/embulk/guess_csv.rb +0 -204
- data/lib/embulk/guess_gzip.rb +0 -16
- data/lib/embulk/guess_newline.rb +0 -20
- data/lib/embulk/time_format_guess.rb +0 -331
data/lib/embulk/guess_gzip.rb
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
module Embulk

  # Guess plugin registered under the name 'gzip'.
  #
  # It inspects the first two bytes of the sample and, when they equal the
  # gzip header bytes (0x1f 0x8b), proposes adding a gzip decoder to the
  # configuration.
  class GzipGuess < GuessPlugin
    Plugin.register_guess('gzip', self)

    # The two leading bytes compared against the sample, tagged as binary.
    GZIP_HEADER = "\x1f\x8b".force_encoding('ASCII-8BIT').freeze

    # config        - the current configuration (not consulted here).
    # sample_buffer - the sampled input; only its first two elements
    #                 (sample_buffer[0,2]) are examined.
    #
    # Returns {"decoders" => [{"type" => "gzip"}]} when the sample starts
    # with GZIP_HEADER, otherwise an empty Hash (nothing to suggest).
    def guess(config, sample_buffer)
      leading_bytes = sample_buffer[0,2]
      return {} unless leading_bytes == GZIP_HEADER

      {"decoders" => [{"type" => "gzip"}]}
    end
  end

end
|
data/lib/embulk/guess_newline.rb
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
module Embulk

  # Guess plugin registered under the name 'newline'.
  #
  # It counts carriage returns, line feeds and CRLF pairs in the sample text
  # and suggests the parser's "newline" option accordingly.
  class GuessNewline < TextGuessPlugin
    Plugin.register_guess('newline', self)

    # config      - the current configuration (not consulted here).
    # sample_text - the sampled text whose line terminators are counted.
    #
    # Returns {"parser" => {"newline" => X}} where X is "CRLF", "CR" or "LF".
    # The thresholds use integer division, so e.g. a single CRLF in the
    # sample (1 > 1 / 2) is already enough to select "CRLF".
    def guess_text(config, sample_text)
      cr_total = sample_text.count("\r")
      lf_total = sample_text.count("\n")
      crlf_total = sample_text.scan(/\r\n/).length

      newline =
        if crlf_total > cr_total / 2 && crlf_total > lf_total / 2
          "CRLF"
        elsif cr_total > lf_total / 2
          "CR"
        else
          # Also the fallback when the sample contains no terminator at all.
          "LF"
        end

      {"parser" => {"newline" => newline}}
    end
  end

end
|
data/lib/embulk/time_format_guess.rb
DELETED
@@ -1,331 +0,0 @@
|
|
1
|
-
module Embulk
  # Heuristics that guess a strftime/strptime-style format string from one or
  # more sample timestamp texts. The entry point is TimeFormatGuess.guess.
  #
  # The module is opened in nested form (rather than
  # `module Embulk::TimeFormatGuess`) so that loading this file does not fail
  # when Embulk has not been defined yet.
  module TimeFormatGuess
    # Regexp fragments for the individual fields of a timestamp. The
    # *_NODELIM variants require a two-digit, zero-padded value because they
    # are used where fields are not separated by a delimiter.
    module Parts
      YEAR = /[1-4][0-9]{3}/
      MONTH = /10|11|12|[0 ]?[0-9]/
      MONTH_NODELIM = /10|11|12|[0][0-9]/
      DAY = /[1-2][0-9]|[0 ]?[1-9]|30|31/
      DAY_NODELIM = /[1-2][0-9]|[0][1-9]|30|31/
      HOUR = /20|21|22|23|24|1[0-9]|[0 ]?[0-9]/
      HOUR_NODELIM = /20|21|22|23|24|1[0-9]|[0][0-9]/
      MINUTE = SECOND = /60|[1-5][0-9]|[0 ]?[0-9]/
      MINUTE_NODELIM = SECOND_NODELIM = /60|[1-5][0-9]|[0][0-9]/

      MONTH_NAME_SHORT = /Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec/
      MONTH_NAME_FULL = /January|February|March|April|May|June|July|August|September|October|November|December/

      WEEKDAY_NAME_SHORT = /Sun|Mon|Tue|Wed|Thu|Fri|Sat/
      WEEKDAY_NAME_FULL = /Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday/
    end

    # Result of GuessPattern#match: the ordered parts found in a text, the
    # delimiters between them, and a per-part option.
    #
    # delimiters[i - 1] is the text placed before parts[i]. part_options[i] is
    # :zero / :blank / :none / nil for padded numeric parts (see
    # GuessPattern#part_heading_option), the digit count for :frac, and nil
    # otherwise.
    class GuessMatch
      attr_reader :part_options

      def initialize(delimiters, parts, part_options)
        @delimiters = delimiters
        @parts = parts
        @part_options = part_options
      end

      # Builds the format string by interleaving delimiters and directives.
      #
      # Raises RuntimeError ("Unknown part: ...") for an unrecognized part.
      def format
        pieces = []
        @parts.each_with_index do |part, i|
          pieces << @delimiters[i - 1] unless i == 0
          pieces << directive_for(part, @part_options[i])
        end
        pieces.join
      end

      # Matches with equal delimiters and parts describe the same layout and
      # may be merged with merge!.
      def mergeable_group
        [@delimiters, @parts]
      end

      # Fills in options this match is missing (nil) from another match of the
      # same group. Options that are already set are kept.
      #
      # The original code also computed a "larger of the two" value here but
      # discarded it; that dead computation is removed and the effective
      # behaviour (fill only the blanks) is unchanged.
      def merge!(another_in_group)
        other_options = another_in_group.part_options
        @part_options.each_index do |i|
          @part_options[i] ||= other_options[i]
        end
        self
      end

      private

      # Maps one part and its option to a format directive.
      def directive_for(part, option)
        case part
        when :year then '%Y'
        # '%_m' / '%-m' are not supported, so every month variant is '%m'.
        when :month then '%m'
        # unpadded days have no supported directive and fall back to '%d'.
        when :day then option == :blank ? '%e' : '%d'
        # unpadded hours have no supported directive and reuse '%k'.
        when :hour then (option == :blank || option == :none) ? '%k' : '%H'
        # heading options are not supported for minutes and seconds.
        when :minute then '%M'
        when :second then '%S'
        # up to 3 fractional digits is milliseconds; anything longer is '%N'.
        when :frac then option <= 3 ? '%L' : '%N'
        when :zone_off then '%z'
        when :zone_abb then '%Z'
        else raise "Unknown part: #{part}"
        end
      end
    end

    # Recognizes numeric dates (year-first or day-first, with or without a
    # '/' or '-' delimiter), optionally followed by a time and a time zone.
    class GuessPattern
      include Parts

      date_delims = /[\/\-]/
      # yyyy-MM-dd
      YMD = /(?<year>#{YEAR})(?<date_delim>#{date_delims})(?<month>#{MONTH})\k<date_delim>(?<day>#{DAY})/
      YMD_NODELIM = /(?<year>#{YEAR})(?<month>#{MONTH_NODELIM})(?<day>#{DAY_NODELIM})/
      # dd/MM/yyyy
      # (these used to be copies of the YMD regexps, which made the day-first
      # branch of #match unreachable)
      DMY = /(?<day>#{DAY})(?<date_delim>#{date_delims})(?<month>#{MONTH})\k<date_delim>(?<year>#{YEAR})/
      DMY_NODELIM = /(?<day>#{DAY_NODELIM})(?<month>#{MONTH_NODELIM})(?<year>#{YEAR})/

      frac = /[0-9]{1,24}/
      time_delims = /[\:\-]/
      frac_delims = /[\.\,]/
      TIME = /(?<hour>#{HOUR})(?<time_delim>#{time_delims})(?<minute>#{MINUTE})(?:\k<time_delim>(?<second>#{SECOND})(?:(?<frac_delim>#{frac_delims})(?<frac>#{frac}))?)?/
      TIME_NODELIM = /(?<hour>#{HOUR_NODELIM})(?<minute>#{MINUTE_NODELIM})((?<second>#{SECOND_NODELIM})(?:(?<frac_delim>#{frac_delims})(?<frac>#{frac}))?)?/

      TZ = /(?<zone_space> )?(?<zone>(?<zone_off>[\-\+]\d\d(?::?\d\d)?)|(?<zone_abb>[A-Z]{3}))|(?<z>Z)/

      # Whole-line regexps, built once instead of on every #match call.
      # Each date form is paired with the order in which its parts appear;
      # forms are tried top to bottom.
      DATE_FORMS = [
        [/^#{YMD}(?<rest>.*?)$/, [:year, :month, :day]],
        [/^#{YMD_NODELIM}(?<rest>.*?)$/, [:year, :month, :day]],
        [/^#{DMY}(?<rest>.*?)$/, [:day, :month, :year]],
        [/^#{DMY_NODELIM}(?<rest>.*?)$/, [:day, :month, :year]],
      ]

      date_time_delims = /[ _T]/
      TIME_AFTER_DELIM = /^(?<date_time_delim>#{date_time_delims})#{TIME}(?<rest>.*?)?$/
      TIME_NODELIM_AFTER_DELIM = /^(?<date_time_delim>#{date_time_delims})#{TIME_NODELIM}(?<rest>.*?)?$/
      TIME_NODELIM_ADJACENT = /^#{TIME_NODELIM}(?<rest>.*?)?$/
      ZONE_LINE = /^#{TZ}$/

      # Tries to interpret text as "date [time] [zone]".
      #
      # Returns a GuessMatch, or nil when the text does not start with a
      # recognized date or has trailing content that is neither a time zone
      # nor whitespace.
      def match(text)
        delimiters = []
        parts = []
        part_options = []

        dm = nil
        order = nil
        DATE_FORMS.each do |regexp, part_order|
          dm = regexp.match(text)
          if dm
            order = part_order
            break
          end
        end
        return nil unless dm

        # Undelimited forms have no date_delim group; treat it as "".
        date_delim = named_group(dm, "date_delim")
        order.each_with_index do |part, i|
          # Exactly one delimiter between consecutive date parts. (The old
          # day-first branch pushed a third one after the year, which would
          # have shifted every following delimiter.)
          delimiters << date_delim unless i == 0
          parts << part
          part_options << (part == :year ? nil : part_heading_option(dm[part.to_s]))
        end
        rest = dm["rest"]

        tm = TIME_AFTER_DELIM.match(rest) || TIME_NODELIM_AFTER_DELIM.match(rest)
        # A time glued directly to the date is accepted only when the date
        # itself was written without delimiters.
        tm ||= TIME_NODELIM_ADJACENT.match(rest) if date_delim == ""

        if tm
          time_delim = named_group(tm, "time_delim")

          delimiters << named_group(tm, "date_time_delim")
          parts << :hour
          part_options << part_heading_option(tm["hour"])

          delimiters << time_delim
          parts << :minute
          part_options << part_heading_option(tm["minute"])

          if tm["second"]
            delimiters << time_delim
            parts << :second
            part_options << part_heading_option(tm["second"])
          end

          if tm["frac"]
            delimiters << tm["frac_delim"]
            parts << :frac
            part_options << tm["frac"].size
          end

          rest = tm["rest"]
        end

        zm = ZONE_LINE.match(rest)
        if zm
          # Parenthesized on purpose: `delimiters << x || ''` parses as
          # `(delimiters << x) || ''` and pushed nil for zones without a
          # leading space, which made #format raise TypeError.
          delimiters << (zm["zone_space"] || '')
          # TODO ISO 8601: a bare "Z" is reported as a zone offset.
          parts << ((zm["z"] || zm["zone_off"]) ? :zone_off : :zone_abb)
          part_options << nil

          GuessMatch.new(delimiters, parts, part_options)
        elsif rest =~ /^\s*$/
          GuessMatch.new(delimiters, parts, part_options)
        else
          nil
        end
      end

      # Classifies how a numeric field is padded: :zero for a leading "0",
      # :blank for a leading space, :none for a single digit, and nil when
      # the field gives no padding information.
      def part_heading_option(text)
        if text[0] == '0'
          :zero
        elsif text[0] == ' '
          :blank
        elsif text.size == 1
          :none
        else
          nil
        end
      end

      private

      # Value of a named group, or "" when the regexp that produced
      # match_data does not define that group at all.
      def named_group(match_data, name)
        match_data.names.include?(name) ? match_data[name] : ""
      end
    end

    # Match result for a fixed, well-known format.
    class RegexpMatch
      def initialize(format)
        # Was a bare `@format` expression, so the format was never stored
        # and #format always returned nil.
        @format = format
      end

      attr_reader :format

      def mergeable_group
        @format
      end

      # Nothing to merge: the format is fixed.
      def merge!(another_in_group)
      end
    end

    # Pairs a whole-text regexp with the fixed format it implies.
    class RegexpPattern
      def initialize(regexp, format)
        @regexp = regexp
        @match = RegexpMatch.new(format)
      end

      # Returns the shared RegexpMatch when text matches, nil otherwise.
      def match(text)
        @regexp =~ text ? @match : nil
      end
    end

    module StandardPatterns
      include Parts

      RFC_822_1123 = /^#{WEEKDAY_NAME_SHORT}, \d\d #{MONTH_NAME_SHORT} \d\d\d\d \d\d:\d\d:\d\d [a-zA-Z]{3}$/
      RFC_850_1035 = /^#{WEEKDAY_NAME_FULL}, \d\d-#{MONTH_NAME_SHORT}-\d\d \d\d:\d\d:\d\d [a-zA-Z]{3}$/
      # NOTE(review): Apache's common log format normally writes ':' between
      # the date and the time; this regexp expects a space - confirm intent.
      APACHE_CLF = /^\d\d\/#{MONTH_NAME_SHORT}\/\d\d\d\d \d\d:\d\d:\d\d [\-\+]\d\d(?::?\d\d)?$/
      ANSI_C_ASCTIME = /^#{WEEKDAY_NAME_SHORT} #{MONTH_NAME_SHORT} \d\d? \d\d:\d\d:\d\d \d\d\d\d$/
    end

    # Patterns tried for every sample text, in this order.
    #
    # The zone directives follow what each regexp accepts: the RFC patterns
    # match a three-letter zone name (%Z), Apache CLF matches a numeric
    # offset (%z). The asctime format previously started with "$a".
    PATTERNS = [
      GuessPattern.new,
      RegexpPattern.new(StandardPatterns::RFC_822_1123, "%a, %d %b %Y %H:%M:%S %Z"),
      RegexpPattern.new(StandardPatterns::RFC_850_1035, "%A, %d-%b-%y %H:%M:%S %Z"),
      RegexpPattern.new(StandardPatterns::APACHE_CLF, "%d/%b/%Y %H:%M:%S %z"),
      RegexpPattern.new(StandardPatterns::ANSI_C_ASCTIME, "%a %b %e %H:%M:%S %Y"),
    ]

    # Guesses a format string for the given sample(s).
    #
    # texts - a String or an Array of Strings; empty strings are ignored.
    #
    # Returns the format String, or nil when no sample matches any pattern.
    # When samples disagree, the layout shared by the most samples wins and
    # the options of its matches are merged.
    def self.guess(texts)
      samples = Array(texts).reject { |text| text == "" }
      matches = samples.flat_map do |text|
        PATTERNS.map { |pattern| pattern.match(text) }.compact
      end

      return nil if matches.empty?
      return matches.first.format if matches.size == 1

      groups = matches.group_by { |m| m.mergeable_group }
      # Compare the number of matches in each group. The previous sort_by
      # measured the [key, matches] pair, whose size is always 2.
      best_match, *others = groups.values.max_by { |members| members.size }
      others.each { |m| best_match.merge!(m) }
      best_match.format
    end
  end
end
|