embulk 0.4.1 → 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,16 +0,0 @@
1
module Embulk

  # Guess plugin registered under the name 'gzip'. It inspects the leading
  # bytes of the sample and, when they equal GZIP_HEADER, proposes adding a
  # gzip decoder to the configuration.
  class GzipGuess < GuessPlugin
    Plugin.register_guess('gzip', self)

    # Two-byte signature compared against the start of the sample buffer.
    # Tagged as binary so the comparison is not affected by text encodings.
    GZIP_HEADER = "\x1f\x8b".force_encoding('ASCII-8BIT').freeze

    # @param config         guess configuration (not consulted here)
    # @param sample_buffer  leading portion of the input to inspect
    # @return [Hash] a config fragment adding a "gzip" decoder when the sample
    #   starts with GZIP_HEADER; an empty Hash otherwise
    def guess(config, sample_buffer)
      gzipped = sample_buffer[0, 2] == GZIP_HEADER
      gzipped ? {"decoders" => [{"type" => "gzip"}]} : {}
    end
  end

end
@@ -1,20 +0,0 @@
1
module Embulk

  # Guess plugin registered under the name 'newline'. It counts CR, LF and
  # CRLF occurrences in the sample text and proposes the dominant line
  # terminator for the parser configuration.
  class GuessNewline < TextGuessPlugin
    Plugin.register_guess('newline', self)

    # @param config       guess configuration (not consulted here)
    # @param sample_text  sample of the input as text
    # @return [Hash] {"parser" => {"newline" => "CRLF" | "CR" | "LF"}}
    def guess_text(config, sample_text)
      crlf_count = sample_text.scan(/\r\n/).length
      # Note: these two counts also include the characters that belong to
      # CRLF pairs, hence the "more than half" comparisons below.
      cr_count = sample_text.count("\r")
      lf_count = sample_text.count("\n")

      newline =
        if crlf_count > cr_count / 2 && crlf_count > lf_count / 2
          "CRLF"
        elsif cr_count > lf_count / 2
          "CR"
        else
          "LF"
        end

      {"parser" => {"newline" => newline}}
    end
  end

end
@@ -1,331 +0,0 @@
1
module Embulk
  # Guesses a strptime-style format string from sample timestamp texts.
  #
  # Each sample is run through every entry of PATTERNS. The resulting matches
  # are grouped by "shape" (see #mergeable_group on the match classes), the
  # largest group wins, and its members are merged into one format string.
  module TimeFormatGuess
    # Regular-expression fragments for the individual timestamp fields.
    # The *_NODELIM variants require a fixed two-digit width because, without
    # delimiters, field boundaries can only be found by position.
    module Parts
      YEAR = /[1-4][0-9]{3}/
      MONTH = /10|11|12|[0 ]?[0-9]/
      MONTH_NODELIM = /10|11|12|[0][0-9]/
      DAY = /[1-2][0-9]|[0 ]?[1-9]|30|31/
      DAY_NODELIM = /[1-2][0-9]|[0][1-9]|30|31/
      HOUR = /20|21|22|23|24|1[0-9]|[0 ]?[0-9]/
      HOUR_NODELIM = /20|21|22|23|24|1[0-9]|[0][0-9]/
      MINUTE = SECOND = /60|[1-5][0-9]|[0 ]?[0-9]/
      MINUTE_NODELIM = SECOND_NODELIM = /60|[1-5][0-9]|[0][0-9]/

      MONTH_NAME_SHORT = /Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec/
      MONTH_NAME_FULL = /January|February|March|April|May|June|July|August|September|October|November|December/

      WEEKDAY_NAME_SHORT = /Sun|Mon|Tue|Wed|Thu|Fri|Sat/
      WEEKDAY_NAME_FULL = /Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday/
    end

    # Result of GuessPattern#match: an ordered list of parts (:year, :month,
    # ...), the delimiters between consecutive parts, and one option per part
    # (a heading style symbol, a fraction digit count, or nil).
    class GuessMatch
      # @param delimiters   [Array<String>] delimiters[i] sits between parts[i] and parts[i + 1]
      # @param parts        [Array<Symbol>] field kinds in textual order
      # @param part_options [Array] per-part option; :zero / :blank / :none / nil
      #   for numeric fields, an Integer digit count for :frac
      def initialize(delimiters, parts, part_options)
        @delimiters = delimiters
        @parts = parts
        @part_options = part_options
      end

      attr_reader :part_options

      # Builds the format string by interleaving delimiters and directives.
      #
      # @return [String]
      # @raise [RuntimeError] when a part kind is not recognised
      def format
        pieces = []
        @parts.each_with_index do |part, i|
          pieces << @delimiters[i - 1] unless i.zero?
          pieces << directive_for(part, @part_options[i])
        end
        pieces.join
      end

      # Matches with equal delimiters and parts describe the same layout and
      # can be merged with #merge!.
      def mergeable_group
        [@delimiters, @parts]
      end

      # Folds the options of another match from the same group into this one.
      # A nil option yields to a non-nil one; when both are set the greater
      # value wins (so the longest observed fraction decides between %L and %N).
      #
      # The original computed this value but never stored it.
      #
      # @param another_in_group [GuessMatch]
      # @return [self]
      def merge!(another_in_group)
        other_options = another_in_group.part_options
        @part_options.each_index do |i|
          mine = @part_options[i]
          theirs = other_options[i]
          @part_options[i] =
            if mine.nil? || theirs.nil?
              mine.nil? ? theirs : mine
            else
              [mine, theirs].max
            end
        end
        self
      end

      private

      # Maps one part and its option to a strptime directive.
      def directive_for(part, option)
        case part
        when :year
          '%Y'
        when :month
          # '%_m' (blank padded) and '%-m' (no padding) are not supported,
          # so every heading style falls back to '%m'.
          '%m'
        when :day
          # No dedicated directive for unpadded days; only blank padding differs.
          option == :blank ? '%e' : '%d'
        when :hour
          # Unpadded hours are not supported; they share '%k' with blank padding.
          (option == :blank || option == :none) ? '%k' : '%H'
        when :minute
          # heading options are not supported
          '%M'
        when :second
          # heading options are not supported
          '%S'
        when :frac
          # option is the number of fractional digits seen in the sample.
          option <= 3 ? '%L' : '%N'
        when :zone_off
          '%z'
        when :zone_abb
          '%Z'
        else
          raise "Unknown part: #{part}"
        end
      end
    end

    # Recognises numeric date / date-time / zone layouts and describes them
    # as a GuessMatch.
    class GuessPattern
      include Parts

      date_delims = /[\/\-]/
      # yyyy-MM-dd
      YMD = /(?<year>#{YEAR})(?<date_delim>#{date_delims})(?<month>#{MONTH})\k<date_delim>(?<day>#{DAY})/
      YMD_NODELIM = /(?<year>#{YEAR})(?<month>#{MONTH_NODELIM})(?<day>#{DAY_NODELIM})/
      # dd/MM/yyyy
      # (previously a verbatim copy of YMD, which made the day-first branch
      # of #match unreachable)
      DMY = /(?<day>#{DAY})(?<date_delim>#{date_delims})(?<month>#{MONTH})\k<date_delim>(?<year>#{YEAR})/
      DMY_NODELIM = /(?<day>#{DAY_NODELIM})(?<month>#{MONTH_NODELIM})(?<year>#{YEAR})/

      frac = /[0-9]{1,24}/
      time_delims = /[\:\-]/
      frac_delims = /[\.\,]/
      TIME = /(?<hour>#{HOUR})(?<time_delim>#{time_delims})(?<minute>#{MINUTE})(?:\k<time_delim>(?<second>#{SECOND})(?:(?<frac_delim>#{frac_delims})(?<frac>#{frac}))?)?/
      TIME_NODELIM = /(?<hour>#{HOUR_NODELIM})(?<minute>#{MINUTE_NODELIM})((?<second>#{SECOND_NODELIM})(?:(?<frac_delim>#{frac_delims})(?<frac>#{frac}))?)?/

      TZ = /(?<zone_space> )?(?<zone>(?<zone_off>[\-\+]\d\d(?::?\d\d)?)|(?<zone_abb>[A-Z]{3}))|(?<z>Z)/

      # Tries to parse +text+ as date [time] [zone].
      #
      # @param text [String]
      # @return [GuessMatch, nil] nil when the text does not start with a
      #   recognised date or has unrecognised trailing content
      def match(text)
        delimiters = []
        parts = []
        part_options = []

        if dm = (/^#{YMD}(?<rest>.*?)$/.match(text) || /^#{YMD_NODELIM}(?<rest>.*?)$/.match(text))
          date_delim = named_capture(dm, "date_delim") || ""

          parts << :year
          part_options << nil
          delimiters << date_delim

          parts << :month
          part_options << part_heading_option(dm["month"])
          delimiters << date_delim

          parts << :day
          part_options << part_heading_option(dm["day"])

        elsif dm = (/^#{DMY}(?<rest>.*?)$/.match(text) || /^#{DMY_NODELIM}(?<rest>.*?)$/.match(text))
          date_delim = named_capture(dm, "date_delim") || ""

          parts << :day
          part_options << part_heading_option(dm["day"])
          delimiters << date_delim

          parts << :month
          part_options << part_heading_option(dm["month"])
          delimiters << date_delim

          # Three parts need exactly two delimiters; nothing is pushed after
          # the last part, otherwise every later delimiter would be shifted.
          parts << :year
          part_options << nil

        else
          return nil
        end
        rest = dm["rest"]

        date_time_delims = /[ _T]/
        # A time glued directly to the date is only accepted when the date
        # itself had no delimiters (e.g. "20140101123456").
        tm = /^(?<date_time_delim>#{date_time_delims})#{TIME}(?<rest>.*?)?$/.match(rest) ||
             /^(?<date_time_delim>#{date_time_delims})#{TIME_NODELIM}(?<rest>.*?)?$/.match(rest) ||
             (date_delim == "" && /^#{TIME_NODELIM}(?<rest>.*?)?$/.match(rest))
        if tm
          date_time_delim = named_capture(tm, "date_time_delim") || ""
          time_delim = named_capture(tm, "time_delim") || ""

          delimiters << date_time_delim
          parts << :hour
          part_options << part_heading_option(tm["hour"])

          delimiters << time_delim
          parts << :minute
          part_options << part_heading_option(tm["minute"])

          if tm["second"]
            delimiters << time_delim
            parts << :second
            part_options << part_heading_option(tm["second"])
          end

          if tm["frac"]
            delimiters << tm["frac_delim"]
            parts << :frac
            part_options << tm["frac"].size
          end

          rest = tm["rest"]
        end

        if zm = /^#{TZ}$/.match(rest)
          # Parentheses matter: `a << b || ''` parses as `(a << b) || ''`
          # and would push nil when the zone has no leading space.
          delimiters << (zm["zone_space"] || '')
          if zm["z"]
            # TODO ISO 8601
            parts << :zone_off
          elsif zm["zone_off"]
            parts << :zone_off
          else
            parts << :zone_abb
          end
          part_options << nil

          return GuessMatch.new(delimiters, parts, part_options)

        elsif rest =~ /^\s*$/
          return GuessMatch.new(delimiters, parts, part_options)

        else
          return nil
        end
      end

      # Classifies how a numeric field is padded.
      #
      # @param text [String] the captured field text
      # @return [Symbol, nil] :zero for a leading "0", :blank for a leading
      #   space, :none for a single character, nil otherwise
      def part_heading_option(text)
        if text[0] == '0'
          :zero
        elsif text[0] == ' '
          :blank
        elsif text.size == 1
          :none
        else
          nil
        end
      end

      private

      # Reads a named group, returning nil when the regexp that produced
      # +match_data+ does not define it (MatchData#[] would raise IndexError).
      def named_capture(match_data, name)
        match_data.names.include?(name) ? match_data[name] : nil
      end
    end

    # Match object for fixed, well-known formats: it carries a constant
    # format string and never needs merging.
    class RegexpMatch
      # @param format [String] strptime format reported for this match
      def initialize(format)
        # The original evaluated `@format` without assigning it, so #format
        # was always nil.
        @format = format
      end

      attr_reader :format

      def mergeable_group
        @format
      end

      # Nothing to merge: all matches in the group share the same format.
      def merge!(another_in_group)
      end
    end

    # Pairs a regular expression with the fixed format it represents.
    class RegexpPattern
      def initialize(regexp, format)
        @regexp = regexp
        @match = RegexpMatch.new(format)
      end

      # @return [RegexpMatch, nil] the shared match object when +text+
      #   matches the regexp, nil otherwise
      def match(text)
        if @regexp =~ text
          return @match
        else
          return nil
        end
      end
    end

    # Regular expressions for common textual timestamp layouts.
    module StandardPatterns
      include Parts

      RFC_822_1123 = /^#{WEEKDAY_NAME_SHORT}, \d\d #{MONTH_NAME_SHORT} \d\d\d\d \d\d:\d\d:\d\d [a-zA-Z]{3}$/
      RFC_850_1035 = /^#{WEEKDAY_NAME_FULL}, \d\d-#{MONTH_NAME_SHORT}-\d\d \d\d:\d\d:\d\d [a-zA-Z]{3}$/
      APACHE_CLF = /^\d\d\/#{MONTH_NAME_SHORT}\/\d\d\d\d \d\d:\d\d:\d\d [\-\+]\d\d(?::?\d\d)?$/
      ANSI_C_ASCTIME = /^#{WEEKDAY_NAME_SHORT} #{MONTH_NAME_SHORT} \d\d? \d\d:\d\d:\d\d \d\d\d\d$/
    end

    # Patterns tried, in order, against every sample.
    #
    # NOTE(review): the RFC 822/850 regexps accept a three-letter zone name
    # but are paired with '%z', while the Apache CLF regexp accepts a numeric
    # offset but is paired with '%Z'. Left unchanged; confirm against the
    # timestamp parser that consumes these formats.
    PATTERNS = [
      GuessPattern.new,
      RegexpPattern.new(StandardPatterns::RFC_822_1123, "%a, %d %b %Y %H:%M:%S %z"),
      RegexpPattern.new(StandardPatterns::RFC_850_1035, "%A, %d-%b-%y %H:%M:%S %z"),
      RegexpPattern.new(StandardPatterns::APACHE_CLF, "%d/%b/%Y %H:%M:%S %Z"),
      # Was "$a ...": '$' is not a directive prefix.
      RegexpPattern.new(StandardPatterns::ANSI_C_ASCTIME, "%a %b %e %H:%M:%S %Y"),
    ]

    # Guesses a single format string for the given sample(s).
    #
    # @param texts [String, Array<String>] one or more sample timestamps;
    #   empty strings are ignored
    # @return [String, nil] the format of the most common layout among the
    #   samples, or nil when no sample matches any pattern
    def self.guess(texts)
      samples = Array(texts).reject { |text| text == "" }
      matches = samples.flat_map do |text|
        PATTERNS.map { |pattern| pattern.match(text) }.compact
      end
      return nil if matches.empty?

      groups = matches.group_by { |match| match.mergeable_group }
      # Rank by the number of matches in each group. The original ranked the
      # [key, matches] pairs themselves, whose size is always 2.
      best_group = groups.values.max_by { |group| group.size }

      best_match, *others = best_group
      others.each { |other| best_match.merge!(other) }
      best_match.format
    end
  end
end