embulk 0.4.1 → 0.4.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,22 @@
1
module Embulk
  module Guess

    # Guess plugin registered under the name 'newline'. It inspects a text
    # sample and reports which line terminator the parser should use.
    class NewlineGuessPlugin < TextGuessPlugin
      Plugin.register_guess('newline', self)

      # Picks "CRLF", "CR" or "LF" from how often each terminator occurs.
      #
      # Note that every "\r\n" pair is also counted once as a CR and once
      # as an LF, which is why the comparisons are made against half of
      # the individual counts (integer division).
      #
      # config      - guess configuration (not consulted here).
      # sample_text - String sample of the input.
      #
      # Returns a Hash of the form {"parser" => {"newline" => <name>}}.
      def guess_text(config, sample_text)
        crs   = sample_text.count("\r")
        lfs   = sample_text.count("\n")
        crlfs = sample_text.scan(/\r\n/).length

        newline =
          if crlfs > crs / 2 && crlfs > lfs / 2
            "CRLF"
          elsif crs > lfs / 2
            "CR"
          else
            "LF"
          end

        return {"parser" => {"newline" => newline}}
      end
    end

  end
end
@@ -0,0 +1,333 @@
1
+ module Embulk::Guess
2
# Guesses a strptime-style format string ("%Y-%m-%d %H:%M:%S", ...) from one
# or more sample timestamp strings. The entry point is TimeFormatGuess.guess.
module TimeFormatGuess
  # Regular-expression fragments for the individual fields of a timestamp.
  # The *_NODELIM variants require exactly two digits, because without a
  # delimiter a one-digit field could not be told apart from its neighbour.
  module Parts
    YEAR = /[1-4][0-9]{3}/
    MONTH = /10|11|12|[0 ]?[0-9]/
    MONTH_NODELIM = /10|11|12|[0][0-9]/
    DAY = /[1-2][0-9]|[0 ]?[1-9]|30|31/
    DAY_NODELIM = /[1-2][0-9]|[0][1-9]|30|31/
    HOUR = /20|21|22|23|24|1[0-9]|[0 ]?[0-9]/
    HOUR_NODELIM = /20|21|22|23|24|1[0-9]|[0][0-9]/
    MINUTE = SECOND = /60|[1-5][0-9]|[0 ]?[0-9]/
    MINUTE_NODELIM = SECOND_NODELIM = /60|[1-5][0-9]|[0][0-9]/

    MONTH_NAME_SHORT = /Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec/
    MONTH_NAME_FULL = /January|February|March|April|May|June|July|August|September|October|November|December/

    WEEKDAY_NAME_SHORT = /Sun|Mon|Tue|Wed|Thu|Fri|Sat/
    WEEKDAY_NAME_FULL = /Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday/
  end

  # Result of GuessPattern#match: the ordered parts of a timestamp, the
  # delimiter that precedes every part except the first, and a per-part
  # option (padding style for numeric fields, digit count for :frac).
  class GuessMatch
    # delimiters   - Array of Strings; delimiters[i-1] precedes parts[i].
    # parts        - Array of Symbols (:year, :month, :day, :hour, :minute,
    #                :second, :frac, :zone_off, :zone_abb).
    # part_options - Array parallel to parts (:zero, :blank, :none, an
    #                Integer digit count for :frac, or nil).
    def initialize(delimiters, parts, part_options)
      @delimiters = delimiters
      @parts = parts
      @part_options = part_options
    end

    attr_reader :part_options

    # Builds the strptime-style format string for this match.
    #
    # Raises RuntimeError if an unknown part symbol is present.
    def format
      pieces = []
      @parts.each_with_index do |part, i|
        # to_s keeps a missing delimiter from raising while concatenating
        pieces << @delimiters[i - 1].to_s if i != 0
        pieces << directive_for(part, @part_options[i])
      end
      return pieces.join
    end

    # Matches with equal delimiters and parts describe the same layout and
    # may be merged with each other.
    def mergeable_group
      [@delimiters, @parts]
    end

    # Folds the part options of another match from the same mergeable
    # group into this one. A nil option is filled in from the other side;
    # when both sides have an option the greater one wins (for :frac this
    # keeps the largest digit count seen in any sample).
    #
    # The original computed this value but never stored it, so only the
    # nil-filling had any effect.
    #
    # Returns self.
    def merge!(another_in_group)
      other_options = another_in_group.part_options
      @part_options.size.times do |i|
        mine = @part_options[i]
        theirs = other_options[i]
        @part_options[i] =
          if mine.nil?
            theirs
          elsif theirs.nil?
            mine
          else
            [mine, theirs].sort.last
          end
      end
      self
    end

    private

    # Maps one part and its option to a single format directive.
    def directive_for(part, option)
      case part
      when :year
        '%Y'
      when :month
        # '%_m' (blank padded) and '%-m' (unpadded) are not supported
        '%m'
      when :day
        # unpadded days are not supported; they fall back to '%d'
        option == :blank ? '%e' : '%d'
      when :hour
        # unpadded hours are not supported; they share '%k' with :blank
        (option == :blank || option == :none) ? '%k' : '%H'
      when :minute
        # heading options are not supported
        '%M'
      when :second
        # heading options are not supported
        '%S'
      when :frac
        # up to 3 digits are milliseconds; anything longer uses '%N'
        option.to_i <= 3 ? '%L' : '%N'
      when :zone_off
        '%z'
      when :zone_abb
        '%Z'
      else
        raise "Unknown part: #{part}"
      end
    end
  end

  # Recognizes numeric date / time / zone layouts and describes them as a
  # GuessMatch.
  class GuessPattern
    include Parts

    date_delims = /[\/\-]/
    # yyyy-MM-dd
    YMD = /(?<year>#{YEAR})(?<date_delim>#{date_delims})(?<month>#{MONTH})\k<date_delim>(?<day>#{DAY})/
    YMD_NODELIM = /(?<year>#{YEAR})(?<month>#{MONTH_NODELIM})(?<day>#{DAY_NODELIM})/
    # dd/MM/yyyy (the original definitions were copies of the YMD ones, so
    # day-first dates could never be recognized)
    DMY = /(?<day>#{DAY})(?<date_delim>#{date_delims})(?<month>#{MONTH})\k<date_delim>(?<year>#{YEAR})/
    DMY_NODELIM = /(?<day>#{DAY_NODELIM})(?<month>#{MONTH_NODELIM})(?<year>#{YEAR})/

    frac = /[0-9]{1,24}/
    time_delims = /[\:\-]/
    frac_delims = /[\.\,]/
    TIME = /(?<hour>#{HOUR})(?<time_delim>#{time_delims})(?<minute>#{MINUTE})(?:\k<time_delim>(?<second>#{SECOND})(?:(?<frac_delim>#{frac_delims})(?<frac>#{frac}))?)?/
    TIME_NODELIM = /(?<hour>#{HOUR_NODELIM})(?<minute>#{MINUTE_NODELIM})((?<second>#{SECOND_NODELIM})(?:(?<frac_delim>#{frac_delims})(?<frac>#{frac}))?)?/

    TZ = /(?<zone_space> )?(?<zone>(?<zone_off>[\-\+]\d\d(?::?\d\d)?)|(?<zone_abb>[A-Z]{3}))|(?<z>Z)/

    # Tries to read text as: date [time [fraction]] [zone].
    #
    # Returns a GuessMatch, or nil when text does not start with a
    # recognized date or has unrecognized trailing content.
    def match(text)
      delimiters = []
      parts = []
      part_options = []

      if dm = (/^#{YMD}(?<rest>.*?)$/.match(text) || /^#{YMD_NODELIM}(?<rest>.*?)$/.match(text))
        date_delim = named_capture(dm, "date_delim") || ""

        parts << :year
        part_options << nil
        delimiters << date_delim

        parts << :month
        part_options << part_heading_option(dm["month"])
        delimiters << date_delim

        parts << :day
        part_options << part_heading_option(dm["day"])

      elsif dm = (/^#{DMY}(?<rest>.*?)$/.match(text) || /^#{DMY_NODELIM}(?<rest>.*?)$/.match(text))
        date_delim = named_capture(dm, "date_delim") || ""

        parts << :day
        part_options << part_heading_option(dm["day"])
        delimiters << date_delim

        parts << :month
        part_options << part_heading_option(dm["month"])
        delimiters << date_delim

        # No delimiter is pushed after the last date part: the delimiter
        # in front of the next part is added together with that part.
        parts << :year
        part_options << nil

      else
        return nil
      end
      rest = dm["rest"]

      date_time_delims = /[ _T]/
      if tm = (
            /^(?<date_time_delim>#{date_time_delims})#{TIME}(?<rest>.*?)?$/.match(rest) ||
            /^(?<date_time_delim>#{date_time_delims})#{TIME_NODELIM}(?<rest>.*?)?$/.match(rest) ||
            (date_delim == "" && /^#{TIME_NODELIM}(?<rest>.*?)?$/.match(rest))
          )
        # Not every alternative above defines these groups.
        date_time_delim = named_capture(tm, "date_time_delim") || ""
        time_delim = named_capture(tm, "time_delim") || ""

        delimiters << date_time_delim
        parts << :hour
        part_options << part_heading_option(tm["hour"])

        delimiters << time_delim
        parts << :minute
        part_options << part_heading_option(tm["minute"])

        if tm["second"]
          delimiters << time_delim
          parts << :second
          part_options << part_heading_option(tm["second"])
        end

        if tm["frac"]
          delimiters << tm["frac_delim"]
          parts << :frac
          part_options << tm["frac"].size
        end

        rest = tm["rest"]
      end

      if zm = /^#{TZ}$/.match(rest)
        # Parenthesized on purpose: `a << b || ''` parses as
        # `(a << b) || ''` and would push nil when there is no space.
        delimiters << (zm["zone_space"] || '')
        if zm["z"]
          # TODO ISO 8601
          parts << :zone_off
        elsif zm["zone_off"]
          parts << :zone_off
        else
          parts << :zone_abb
        end
        part_options << nil

        return GuessMatch.new(delimiters, parts, part_options)

      elsif rest =~ /^\s*$/
        return GuessMatch.new(delimiters, parts, part_options)

      else
        return nil
      end
    end

    # Classifies how a numeric field is padded: :zero for a leading "0",
    # :blank for a leading space, :none for a single character, and nil
    # for a two-character value without padding.
    def part_heading_option(text)
      if text[0] == '0'
        :zero
      elsif text[0] == ' '
        :blank
      elsif text.size == 1
        :none
      else
        nil
      end
    end

    private

    # Returns the named capture, or nil when the regexp that produced
    # match_data does not define a group with that name (MatchData#[]
    # would raise IndexError in that case).
    def named_capture(match_data, name)
      match_data.names.include?(name) ? match_data[name] : nil
    end
  end

  # Match object shared by every text that a RegexpPattern accepts; it
  # simply carries a fixed format string.
  class RegexpMatch
    def initialize(format)
      # The original evaluated `@format` without assigning it, so #format
      # always returned nil.
      @format = format
    end

    attr_reader :format

    def mergeable_group
      @format
    end

    # Nothing to merge: the format is fixed.
    def merge!(another_in_group)
    end
  end

  # Pairs a whole-string regexp with the fixed format it implies.
  class RegexpPattern
    def initialize(regexp, format)
      @regexp = regexp
      @match = RegexpMatch.new(format)
    end

    # Returns the shared RegexpMatch when text matches, otherwise nil.
    def match(text)
      if @regexp =~ text
        return @match
      else
        return nil
      end
    end
  end

  # Regexps for well-known textual timestamp layouts.
  module StandardPatterns
    include Parts

    RFC_822_1123 = /^#{WEEKDAY_NAME_SHORT}, \d\d #{MONTH_NAME_SHORT} \d\d\d\d \d\d:\d\d:\d\d [a-zA-Z]{3}$/
    RFC_850_1035 = /^#{WEEKDAY_NAME_FULL}, \d\d-#{MONTH_NAME_SHORT}-\d\d \d\d:\d\d:\d\d [a-zA-Z]{3}$/
    APACHE_CLF = /^\d\d\/#{MONTH_NAME_SHORT}\/\d\d\d\d \d\d:\d\d:\d\d [\-\+]\d\d(?::?\d\d)?$/
    ANSI_C_ASCTIME = /^#{WEEKDAY_NAME_SHORT} #{MONTH_NAME_SHORT} \d\d? \d\d:\d\d:\d\d \d\d\d\d$/
  end

  # Zone directives follow the same convention as GuessMatch#format:
  # '%Z' where the regexp accepts a three-letter abbreviation and '%z'
  # where it accepts a numeric offset. The asctime format previously
  # started with "$a" instead of "%a".
  PATTERNS = [
    GuessPattern.new,
    RegexpPattern.new(StandardPatterns::RFC_822_1123, "%a, %d %b %Y %H:%M:%S %Z"),
    RegexpPattern.new(StandardPatterns::RFC_850_1035, "%A, %d-%b-%y %H:%M:%S %Z"),
    RegexpPattern.new(StandardPatterns::APACHE_CLF, "%d/%b/%Y %H:%M:%S %z"),
    RegexpPattern.new(StandardPatterns::ANSI_C_ASCTIME, "%a %b %e %H:%M:%S %Y"),
  ]

  # Guesses the format shared by the given sample(s).
  #
  # texts - a String or an Array of Strings; empty strings are ignored.
  #
  # Every sample is tried against every pattern. Matches are grouped by
  # layout, the layout seen most often wins (the first one seen on a tie)
  # and the options of its matches are merged.
  #
  # Returns the format String, or nil when no sample matched any pattern.
  def self.guess(texts)
    texts = Array(texts).select {|text| text != "" }
    matches = texts.map do |text|
      PATTERNS.map {|pattern| pattern.match(text) }.compact
    end.flatten
    return nil if matches.empty?

    match_groups = matches.group_by {|match| match.mergeable_group }
    # Compare by the number of members. Each group is a [key, members]
    # pair, so comparing `group.size` (always 2) picked an arbitrary one.
    best_match_group = match_groups.max_by {|_key, members| members.size }.last
    best_match = best_match_group.first
    best_match_group.drop(1).each {|m| best_match.merge!(m) }
    return best_match.format
  end
end
333
+ end
@@ -1,3 +1,3 @@
1
1
  module Embulk
2
- VERSION = "0.4.1"
2
+ VERSION = "0.4.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sadayuki Furuhashi
@@ -267,6 +267,7 @@ files:
267
267
  - embulk-docs/src/release/release-0.3.2.rst
268
268
  - embulk-docs/src/release/release-0.4.0.rst
269
269
  - embulk-docs/src/release/release-0.4.1.rst
270
+ - embulk-docs/src/release/release-0.4.2.rst
270
271
  - embulk-standards/build.gradle
271
272
  - embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java
272
273
  - embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java
@@ -335,10 +336,11 @@ files:
335
336
  - lib/embulk/file_output_plugin.rb
336
337
  - lib/embulk/filter_plugin.rb
337
338
  - lib/embulk/formatter_plugin.rb
338
- - lib/embulk/guess_charset.rb
339
- - lib/embulk/guess_csv.rb
340
- - lib/embulk/guess_gzip.rb
341
- - lib/embulk/guess_newline.rb
339
+ - lib/embulk/guess/charset.rb
340
+ - lib/embulk/guess/csv.rb
341
+ - lib/embulk/guess/gzip.rb
342
+ - lib/embulk/guess/newline.rb
343
+ - lib/embulk/guess/time_format_guess.rb
342
344
  - lib/embulk/guess_plugin.rb
343
345
  - lib/embulk/input_plugin.rb
344
346
  - lib/embulk/java/bootstrap.rb
@@ -352,7 +354,6 @@ files:
352
354
  - lib/embulk/plugin.rb
353
355
  - lib/embulk/plugin_registry.rb
354
356
  - lib/embulk/schema.rb
355
- - lib/embulk/time_format_guess.rb
356
357
  - lib/embulk/version.rb
357
358
  - settings.gradle
358
359
  - classpath/annotations-3.0.0.jar
@@ -361,8 +362,8 @@ files:
361
362
  - classpath/bval-jsr303-0.5.jar
362
363
  - classpath/commons-beanutils-core-1.8.3.jar
363
364
  - classpath/commons-lang3-3.1.jar
364
- - classpath/embulk-core-0.4.1.jar
365
- - classpath/embulk-standards-0.4.1.jar
365
+ - classpath/embulk-core-0.4.2.jar
366
+ - classpath/embulk-standards-0.4.2.jar
366
367
  - classpath/guava-18.0.jar
367
368
  - classpath/guice-3.0.jar
368
369
  - classpath/guice-multibindings-3.0.jar
@@ -1,26 +0,0 @@
1
module Embulk

  # Guess plugin registered under the name 'charset'. It runs ICU4J's
  # charset detector over the sample bytes.
  class GuessCharset < GuessPlugin
    Plugin.register_guess('charset', self)

    # Detects the character set of sample_buffer.
    #
    # config        - guess configuration (not consulted here).
    # sample_buffer - sample of the input; converted with to_java_bytes.
    #
    # Falls back to "UTF-8" when the detector reports no match at all,
    # when its confidence is below 50, or when it reports ISO-8859-1.
    #
    # Returns a Hash of the form {"parser" => {"charset" => <name>}}.
    def guess(config, sample_buffer)
      # ICU4J
      detector = com.ibm.icu.text.CharsetDetector.new
      detector.setText(sample_buffer.to_java_bytes)
      best_match = detector.detect

      # detect may yield no match (nil); treat that like a low-confidence
      # result instead of failing on nil.getConfidence
      if best_match.nil? || best_match.getConfidence < 50
        name = "UTF-8"
      else
        name = best_match.getName
        if name == "ISO-8859-1"
          # ISO-8859-1 means ASCII which is a subset
          # of UTF-8 in most of cases due to lack of
          # sample data set
          name = "UTF-8"
        end
      end
      return {"parser" => {"charset" => name}}
    end
  end

end
@@ -1,204 +0,0 @@
1
module Embulk
  require_relative 'time_format_guess'

  # Guess plugin registered under the name 'csv'. From a set of sample
  # lines it guesses the delimiter, the quote character, whether the first
  # line is a header, and the column names and types.
  class GuessCsv < LineGuessPlugin
    Plugin.register_guess('csv', self)

    DELIMITER_CANDIDATES = [
      ",", "\t", "|"
    ]

    QUOTE_CANDIDATES = [
      "\"", "'"
    ]

    # CsvParserPlugin.TRUE_STRINGS
    #
    # The list of pairs is passed to Hash[] as ONE argument. Splatting it
    # (as the original did) makes Hash[] read the pairs as alternating
    # keys and values, so no string was ever found in the table.
    TRUE_STRINGS = Hash[%w[
      true True TRUE
      yes Yes YES
      y Y
      on On ON
      1
    ].map {|k| [k, true] }]

    # Guesses the CSV parser configuration from sample lines.
    #
    # config       - Hash; "header_line" and "columns" under config["parser"]
    #                are left alone when already present.
    # sample_lines - Array of String lines.
    #
    # Returns {"parser" => {...}}, or {} when the sample does not look
    # like CSV or fewer than two columns could be identified.
    def guess_lines(config, sample_lines)
      delim = guess_delimiter(sample_lines)
      # not CSV file
      return {} unless delim

      parser_config = config["parser"] || {}
      parser_guessed = {"type" => "csv", "delimiter" => delim}
      parser_guessed["quote"] = guess_quote(sample_lines, delim) || ''

      sample_records = sample_lines.map {|line| line.split(delim) } # TODO use CsvTokenizer
      first_types = guess_field_types(sample_records[0, 1])
      other_types = guess_field_types(sample_records[1..-1])

      # guess failed
      return {} if first_types.size <= 1 || other_types.size <= 1

      unless parser_config.has_key?("header_line")
        # a header is a first line made only of strings whose types differ
        # from those of the remaining lines
        parser_guessed["header_line"] = (first_types != other_types && first_types.all? {|t| t == ["string"] })
      end

      unless parser_config.has_key?("columns")
        if parser_guessed["header_line"] || parser_config["header_line"]
          column_names = sample_records.first
        else
          # exclusive range: exactly one default name per guessed column
          column_names = (0...other_types.size).map {|i| "c#{i}" }
        end
        schema = []
        column_names.zip(other_types).each do |name,(type,format)|
          next unless name && type
          column = {"name" => name, "type" => type}
          column["format"] = format if format
          schema << column
        end
        parser_guessed["columns"] = schema
      end

      return {"parser" => parser_guessed}
    end

    private

    # Scores each candidate by (total occurrences / standard deviation of
    # the per-line counts) and returns the best one, or nil when no
    # candidate occurs or the best weight is not above 1.
    def guess_delimiter(sample_lines)
      delim_weights = DELIMITER_CANDIDATES.map do |d|
        counts = sample_lines.map {|line| line.count(d) }
        total = array_sum(counts)
        if total > 0
          stddev = array_standard_deviation(counts)
          # avoid dividing by zero when every line has the same count
          stddev = 0.000000001 if stddev == 0.0
          [d, total / stddev]
        else
          [nil, 0]
        end
      end

      delim, weight = delim_weights.sort_by {|_d,w| w }.last
      if delim != nil && weight > 1
        return delim
      else
        return nil
      end
    end

    # Scores each quote candidate per line (lines without the candidate
    # are ignored) and returns the candidate with the best average weight,
    # or nil when that average is below 10.
    def guess_quote(sample_lines, delim)
      delim_regexp = Regexp.escape(delim)
      quote_weights = QUOTE_CANDIDATES.map do |q|
        q_regexp = Regexp.escape(q)
        weights = sample_lines.map do |line|
          count = line.count(q)
          if count > 0
            weight = count
            # quoted field that contains no quote character
            weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{q_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 20
            # quoted field that contains no delimiter
            weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{delim_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 40
            weight
          else
            nil
          end
        end.compact
        weights.empty? ? 0 : array_avg(weights)
      end
      quote, weight = QUOTE_CANDIDATES.zip(quote_weights).sort_by {|_q,w| w }.last
      if weight >= 10.0
        return quote
      else
        return nil
      end
    end

    # Guesses one type per column from rows of fields.
    #
    # Returns an Array with one entry per column: [type], or
    # ["timestamp", format] for timestamp columns.
    def guess_field_types(field_lines)
      column_lines = []
      field_lines.each do |fields|
        fields.each_with_index {|field,i| (column_lines[i] ||= []) << guess_type(field) }
      end
      columns = column_lines.map do |types|
        merged = types.inject(nil) {|r,type| merge_type(r, type) } || "string"
        if merged.is_a?(TimestampMatch)
          format = TimeFormatGuess.guess(types.map {|type| type.text })
          ["timestamp", format]
        else
          [merged]
        end
      end
      return columns
    end

    # Maps a sorted pair of differing type names to the type both can be
    # represented as.
    TYPE_COALESCE = Hash[{
      long: :double,
      boolean: :long,
    }.map {|k,v|
      [[k.to_s, v.to_s].sort, v.to_s]
    }]

    # Combines two guessed types; unrelated types degrade to "string".
    def merge_type(type1, type2)
      if type1 == type2
        type1
      elsif type1.nil? || type2.nil?
        type1 || type2
      else
        TYPE_COALESCE[[type1, type2].sort] || "string"
      end
    end

    # The String "timestamp" that also remembers the text it was guessed
    # from, so that a format can be guessed for the whole column later.
    class TimestampMatch < String
      def initialize(text)
        super("timestamp")
        @text = text
      end
      attr_reader :text
    end

    # Guesses the type of a single field: "boolean", a TimestampMatch,
    # "long", "double" or "string".
    def guess_type(str)
      if TRUE_STRINGS[str]
        return "boolean"
      end

      if TimeFormatGuess.guess(str)
        return TimestampMatch.new(str)
      end

      if str.to_i.to_s == str
        return "long"
      end

      if str.include?('.')
        a, b = str.split(".", 2)
        # The fractional part must be plain digits. Round-tripping it
        # through to_i rejected "1.05" and accepted "1.-5".
        if a.to_i.to_s == a && b =~ /\A[0-9]+\z/
          return "double"
        end
      end

      return "string"
    end

    def array_sum(array)
      array.inject(0) {|r,i| r + i }
    end

    def array_avg(array)
      array.inject(0.0) {|r,i| r + i } / array.size
    end

    def array_variance(array)
      avg = array_avg(array)
      array.inject(0.0) {|r,i| r + (i - avg) ** 2 } / array.size
    end

    def array_standard_deviation(array)
      Math.sqrt(array_variance(array))
    end
  end

end