embulk 0.4.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +0,0 @@
1
module Embulk

  # Guess plugin registered under the name 'gzip'. It inspects the start of
  # the sample and, when it looks like a gzip stream, suggests adding a gzip
  # decoder to the configuration.
  class GzipGuess < GuessPlugin
    Plugin.register_guess('gzip', self)

    # The two leading bytes compared against the sample (0x1f 0x8b, the gzip
    # member header ID bytes), kept as a frozen binary string.
    GZIP_HEADER = "\x1f\x8b".force_encoding('ASCII-8BIT').freeze

    # config        - current configuration; not consulted by this plugin.
    # sample_buffer - sample of the input; only its first two elements
    #                 (sample_buffer[0,2]) are examined.
    #
    # Returns a config fragment requesting a gzip decoder when the sample
    # starts with GZIP_HEADER, otherwise an empty Hash (nothing to suggest).
    def guess(config, sample_buffer)
      looks_gzipped = (sample_buffer[0,2] == GZIP_HEADER)
      looks_gzipped ? {"decoders" => [{"type" => "gzip"}]} : {}
    end
  end

end
@@ -1,20 +0,0 @@
1
module Embulk

  # Guess plugin registered under the name 'newline'. It counts line-ending
  # characters in a text sample and suggests which newline convention the
  # parser should use.
  class GuessNewline < TextGuessPlugin
    Plugin.register_guess('newline', self)

    # config      - current configuration; not consulted by this plugin.
    # sample_text - text sample whose "\r", "\n" and "\r\n" occurrences are
    #               counted.
    #
    # Returns {"parser" => {"newline" => X}} where X is:
    #   "CRLF" when "\r\n" pairs exceed half of both the CR and the LF counts,
    #   "CR"   when, failing that, CRs exceed half the LF count,
    #   "LF"   otherwise (including a sample with no line endings at all).
    # The halving uses Integer division, as the counts are Integers.
    def guess_text(config, sample_text)
      crs = sample_text.count("\r")
      lfs = sample_text.count("\n")
      crlfs = sample_text.scan(/\r\n/).length

      newline =
        if crlfs > crs / 2 && crlfs > lfs / 2
          "CRLF"
        elsif crs > lfs / 2
          "CR"
        else
          "LF"
        end

      {"parser" => {"newline" => newline}}
    end
  end

end
@@ -1,331 +0,0 @@
1
module Embulk
  # Heuristics that turn sample timestamp strings into a strftime/strptime
  # style format string.
  #
  # Entry point: TimeFormatGuess.guess(texts). Each sample is tried against
  # every entry of PATTERNS; the matches are grouped by "shape" and the most
  # common shape decides the returned format.
  module TimeFormatGuess
    # Regexp fragments for the individual fields of a timestamp. The
    # *_NODELIM variants require two digits because, without delimiters,
    # field boundaries can only be found by fixed width.
    module Parts
      YEAR = /[1-4][0-9]{3}/
      MONTH = /10|11|12|[0 ]?[0-9]/
      MONTH_NODELIM = /10|11|12|[0][0-9]/
      # "31" and "30" are listed first: alternation is ordered, and with the
      # single-digit alternative ahead of them "30" was consumed as day "3"
      # leaving a stray "0" that made the whole guess fail.
      DAY = /31|30|[1-2][0-9]|[0 ]?[1-9]/
      DAY_NODELIM = /31|30|[1-2][0-9]|[0][1-9]/
      HOUR = /20|21|22|23|24|1[0-9]|[0 ]?[0-9]/
      HOUR_NODELIM = /20|21|22|23|24|1[0-9]|[0][0-9]/
      MINUTE = SECOND = /60|[1-5][0-9]|[0 ]?[0-9]/
      MINUTE_NODELIM = SECOND_NODELIM = /60|[1-5][0-9]|[0][0-9]/

      MONTH_NAME_SHORT = /Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec/
      MONTH_NAME_FULL = /January|February|March|April|May|June|July|August|September|October|November|December/

      WEEKDAY_NAME_SHORT = /Sun|Mon|Tue|Wed|Thu|Fri|Sat/
      WEEKDAY_NAME_FULL = /Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday/
    end

    # Result of GuessPattern#match: the ordered fields found in one sample,
    # the delimiters between them, and a per-field option (leading-character
    # style for numeric fields, digit count for :frac, nil when unknown).
    class GuessMatch
      # delimiters   - Array of Strings; delimiters[i - 1] precedes parts[i].
      # parts        - Array of Symbols (:year, :month, :day, :hour, :minute,
      #                :second, :frac, :zone_off, :zone_abb).
      # part_options - Array parallel to parts.
      def initialize(delimiters, parts, part_options)
        @delimiters = delimiters
        @parts = parts
        @part_options = part_options
      end

      attr_reader :part_options

      # Builds the format string by interleaving delimiters and directives.
      #
      # Raises RuntimeError for a part name this class does not know.
      def format
        pieces = []
        @parts.each_with_index do |part, i|
          pieces << @delimiters[i - 1] if i != 0
          pieces << directive_for(part, @part_options[i])
        end
        pieces.join
      end

      # Key used to group matches: two matches can be merged only when they
      # have the same delimiters and the same sequence of parts.
      def mergeable_group
        [@delimiters, @parts]
      end

      # Fills in options this match does not know yet (nil) from another
      # match of the same group. Options that are already set are kept, so
      # the earliest sample that determined an option wins.
      def merge!(another_in_group)
        others = another_in_group.part_options
        @part_options.size.times do |i|
          @part_options[i] ||= others[i]
        end
      end

      private

      # Maps one part (and its option) to a format directive.
      def directive_for(part, option)
        case part
        when :year
          '%Y'
        when :month
          # '%_m' (blank padded) and '%-m' (no padding) are not supported,
          # so every heading style maps to '%m'.
          '%m'
        when :day
          # no-padding days are not supported; only blank padding has its
          # own directive.
          option == :blank ? '%e' : '%d'
        when :hour
          # no-padding hours are not supported and share '%k' with blank
          # padding.
          (option == :blank || option == :none) ? '%k' : '%H'
        when :minute
          # heading options are not supported
          '%M'
        when :second
          # heading options are not supported
          '%S'
        when :frac
          # option is the number of fractional digits seen; widths such as
          # '%6N' / '%9N' are intentionally not emitted.
          option <= 3 ? '%L' : '%N'
        when :zone_off
          '%z'
        when :zone_abb
          '%Z'
        else
          raise "Unknown part: #{part}"
        end
      end
    end

    # Free-form matcher for numeric timestamps: a date (year-month-day or
    # day-month-year, with or without delimiters), optionally followed by a
    # time and a time zone.
    class GuessPattern
      include Parts

      date_delims = /[\/\-]/
      # yyyy-MM-dd
      YMD = /(?<year>#{YEAR})(?<date_delim>#{date_delims})(?<month>#{MONTH})\k<date_delim>(?<day>#{DAY})/
      YMD_NODELIM = /(?<year>#{YEAR})(?<month>#{MONTH_NODELIM})(?<day>#{DAY_NODELIM})/
      # dd/MM/yyyy
      DMY = /(?<day>#{DAY})(?<date_delim>#{date_delims})(?<month>#{MONTH})\k<date_delim>(?<year>#{YEAR})/
      DMY_NODELIM = /(?<day>#{DAY_NODELIM})(?<month>#{MONTH_NODELIM})(?<year>#{YEAR})/

      frac = /[0-9]{1,24}/
      time_delims = /[\:\-]/
      frac_delims = /[\.\,]/
      TIME = /(?<hour>#{HOUR})(?<time_delim>#{time_delims})(?<minute>#{MINUTE})(?:\k<time_delim>(?<second>#{SECOND})(?:(?<frac_delim>#{frac_delims})(?<frac>#{frac}))?)?/
      TIME_NODELIM = /(?<hour>#{HOUR_NODELIM})(?<minute>#{MINUTE_NODELIM})((?<second>#{SECOND_NODELIM})(?:(?<frac_delim>#{frac_delims})(?<frac>#{frac}))?)?/

      TZ = /(?<zone_space> )?(?<zone>(?<zone_off>[\-\+]\d\d(?::?\d\d)?)|(?<zone_abb>[A-Z]{3}))|(?<z>Z)/

      date_time_delims = /[ _T]/

      # Anchored forms, compiled once instead of on every #match call. Each
      # list is tried in order and the first regexp that matches wins.
      YMD_LINES = [
        /^#{YMD}(?<rest>.*?)$/,
        /^#{YMD_NODELIM}(?<rest>.*?)$/,
      ].freeze
      DMY_LINES = [
        /^#{DMY}(?<rest>.*?)$/,
        /^#{DMY_NODELIM}(?<rest>.*?)$/,
      ].freeze
      DELIMITED_TIME_LINES = [
        /^(?<date_time_delim>#{date_time_delims})#{TIME}(?<rest>.*?)?$/,
        /^(?<date_time_delim>#{date_time_delims})#{TIME_NODELIM}(?<rest>.*?)?$/,
      ].freeze
      BARE_TIME_LINE = /^#{TIME_NODELIM}(?<rest>.*?)?$/
      TZ_LINE = /^#{TZ}$/
      BLANK_LINE = /^\s*$/

      # Tries to interpret text as a numeric timestamp.
      #
      # Returns a GuessMatch, or nil when text does not start with a
      # recognizable date or has unrecognized trailing content.
      def match(text)
        delimiters = []
        parts = []
        part_options = []

        if dm = first_match(YMD_LINES, text)
          date_order = [:year, :month, :day]
        elsif dm = first_match(DMY_LINES, text)
          date_order = [:day, :month, :year]
        else
          return nil
        end
        # The *_NODELIM regexps have no date_delim group at all.
        date_delim = named_capture(dm, "date_delim") || ""

        date_order.each_with_index do |part, i|
          # exactly one delimiter between consecutive date fields
          delimiters << date_delim if i != 0
          parts << part
          part_options << (part == :year ? nil : part_heading_option(dm[part.to_s]))
        end
        rest = dm["rest"]

        tm = first_match(DELIMITED_TIME_LINES, rest)
        # A time glued directly to the date is accepted only when the date
        # itself was written without delimiters (e.g. "20150102103000").
        tm ||= BARE_TIME_LINE.match(rest) if date_delim == ""
        if tm
          date_time_delim = named_capture(tm, "date_time_delim") || ""
          time_delim = named_capture(tm, "time_delim") || ""

          delimiters << date_time_delim
          parts << :hour
          part_options << part_heading_option(tm["hour"])

          delimiters << time_delim
          parts << :minute
          part_options << part_heading_option(tm["minute"])

          if tm["second"]
            delimiters << time_delim
            parts << :second
            part_options << part_heading_option(tm["second"])
          end

          if tm["frac"]
            delimiters << tm["frac_delim"]
            parts << :frac
            # for :frac the option is the number of digits
            part_options << tm["frac"].size
          end

          rest = tm["rest"]
        end

        if zm = TZ_LINE.match(rest)
          # zone_space is nil when the zone follows without a space; the
          # parentheses matter because << binds tighter than ||.
          delimiters << (zm["zone_space"] || '')
          if zm["z"]
            # TODO ISO 8601
            parts << :zone_off
          elsif zm["zone_off"]
            parts << :zone_off
          else
            parts << :zone_abb
          end
          part_options << nil

          GuessMatch.new(delimiters, parts, part_options)
        elsif rest =~ BLANK_LINE
          GuessMatch.new(delimiters, parts, part_options)
        else
          nil
        end
      end

      # Classifies how a numeric field is written:
      #   :zero  - starts with "0"
      #   :blank - starts with a space
      #   :none  - a single digit without padding
      #   nil    - two significant digits, padding style unknown
      def part_heading_option(text)
        if text[0] == '0'
          :zero
        elsif text[0] == ' '
          :blank
        elsif text.size == 1
          :none
        else
          nil
        end
      end

      private

      # Returns the MatchData of the first regexp that matches text, or nil.
      def first_match(regexps, text)
        regexps.each do |regexp|
          md = regexp.match(text)
          return md if md
        end
        nil
      end

      # Returns the named group's text, or nil when the regexp that produced
      # md does not define that group (MatchData#[] would raise IndexError).
      def named_capture(md, name)
        md.names.include?(name) ? md[name] : nil
      end
    end

    # Match result for a fixed, well-known format.
    class RegexpMatch
      def initialize(format)
        @format = format
      end

      attr_reader :format

      # All matches of the same fixed format belong to one group.
      def mergeable_group
        @format
      end

      # Nothing to merge: the format is fixed.
      def merge!(another_in_group)
      end
    end

    # Pairs a regexp with the fixed format string to report when it matches.
    class RegexpPattern
      def initialize(regexp, format)
        @regexp = regexp
        @match = RegexpMatch.new(format)
      end

      # Returns the shared RegexpMatch when text matches, otherwise nil.
      def match(text)
        if @regexp =~ text
          return @match
        else
          return nil
        end
      end
    end

    # Regexps for common textual timestamp layouts.
    module StandardPatterns
      include Parts

      RFC_822_1123 = /^#{WEEKDAY_NAME_SHORT}, \d\d #{MONTH_NAME_SHORT} \d\d\d\d \d\d:\d\d:\d\d [a-zA-Z]{3}$/
      RFC_850_1035 = /^#{WEEKDAY_NAME_FULL}, \d\d-#{MONTH_NAME_SHORT}-\d\d \d\d:\d\d:\d\d [a-zA-Z]{3}$/
      APACHE_CLF = /^\d\d\/#{MONTH_NAME_SHORT}\/\d\d\d\d \d\d:\d\d:\d\d [\-\+]\d\d(?::?\d\d)?$/
      ANSI_C_ASCTIME = /^#{WEEKDAY_NAME_SHORT} #{MONTH_NAME_SHORT} \d\d? \d\d:\d\d:\d\d \d\d\d\d$/
    end

    # Matchers tried for every sample, in this order. The zone directive of
    # each fixed format follows what its regexp accepts: %Z for a
    # three-letter zone name, %z for a numeric offset.
    PATTERNS = [
      GuessPattern.new,
      RegexpPattern.new(StandardPatterns::RFC_822_1123, "%a, %d %b %Y %H:%M:%S %Z"),
      RegexpPattern.new(StandardPatterns::RFC_850_1035, "%A, %d-%b-%y %H:%M:%S %Z"),
      RegexpPattern.new(StandardPatterns::APACHE_CLF, "%d/%b/%Y %H:%M:%S %z"),
      RegexpPattern.new(StandardPatterns::ANSI_C_ASCTIME, "%a %b %e %H:%M:%S %Y"),
    ]

    # Guesses a time format from one sample or an Array of samples.
    #
    # texts - a String, an Array of Strings, or nil. nil and "" entries are
    #         ignored.
    #
    # Returns the format String of the most frequently matched shape (ties go
    # to the shape seen first), or nil when no sample matched any pattern.
    def self.guess(texts)
      samples = Array(texts).reject {|text| text.nil? || text == "" }
      matches = samples.map do |text|
        PATTERNS.map {|pattern| pattern.match(text) }.compact
      end.flatten
      return nil if matches.empty?

      groups = matches.group_by {|match| match.mergeable_group }
      # Rank by the number of matches in each group, not by the size of the
      # [key, matches] pair (which is always 2).
      best_group = groups.values.max_by {|members| members.size }
      best_match, *others = best_group
      others.each {|m| best_match.merge!(m) }
      best_match.format
    end
  end
end