embulk 0.4.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
module Embulk
  module Guess

    # Guess plugin registered under the name 'newline'. It inspects a text
    # sample and proposes the parser's "newline" option.
    class NewlineGuessPlugin < TextGuessPlugin
      Plugin.register_guess('newline', self)

      # Chooses between "CRLF", "CR" and "LF" by counting line terminators.
      #
      # config      - guess configuration (not read by this method).
      # sample_text - String sample of the input.
      #
      # Returns a Hash of the form {"parser" => {"newline" => <name>}}.
      def guess_text(config, sample_text)
        crs = sample_text.count("\r")
        lfs = sample_text.count("\n")
        crlfs = sample_text.scan(/\r\n/).length

        # CRLF wins when it accounts for more than half of both the CR and
        # the LF occurrences (integer division, as the counts are Integers);
        # otherwise CR wins when it outnumbers half of the LFs; LF is the
        # fallback.
        newline =
          if crlfs > crs / 2 && crlfs > lfs / 2
            "CRLF"
          elsif crs > lfs / 2
            "CR"
          else
            "LF"
          end

        {"parser" => {"newline" => newline}}
      end
    end

  end
end
@@ -0,0 +1,333 @@
1
+ module Embulk::Guess
2
+ module TimeFormatGuess
3
# Regular-expression fragments for the individual fields of a timestamp.
# The *_NODELIM variants require a fixed two-digit, zero-padded field and are
# meant for formats without separators between fields.
module Parts
  YEAR = /[1-4][0-9]{3}/
  MONTH = /10|11|12|[0 ]?[0-9]/
  MONTH_NODELIM = /10|11|12|[0][0-9]/
  # "30" and "31" must be tried before the one-digit alternative: regexp
  # alternation is ordered, so with `[0 ]?[1-9]` first an unanchored match on
  # "30" would stop after consuming only "3".
  DAY = /30|31|[1-2][0-9]|[0 ]?[1-9]/
  DAY_NODELIM = /[1-2][0-9]|[0][1-9]|30|31/
  HOUR = /20|21|22|23|24|1[0-9]|[0 ]?[0-9]/
  HOUR_NODELIM = /20|21|22|23|24|1[0-9]|[0][0-9]/
  # 60 is accepted for both minutes and seconds.
  MINUTE = SECOND = /60|[1-5][0-9]|[0 ]?[0-9]/
  MINUTE_NODELIM = SECOND_NODELIM = /60|[1-5][0-9]|[0][0-9]/

  MONTH_NAME_SHORT = /Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec/
  MONTH_NAME_FULL = /January|February|March|April|May|June|July|August|September|October|November|December/

  WEEKDAY_NAME_SHORT = /Sun|Mon|Tue|Wed|Thu|Fri|Sat/
  WEEKDAY_NAME_FULL = /Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday/
end
20
+
21
# Result of matching one text against GuessPattern.
#
# Holds the ordered list of recognized parts (:year, :month, ...), the
# delimiter strings found between consecutive parts, and one option per part
# (a heading style :zero / :blank / :none / nil, or the digit count for
# :frac).
class GuessMatch
  attr_reader :part_options

  # delimiters   - Array of Strings; delimiters[i-1] precedes parts[i].
  # parts        - Array of part Symbols.
  # part_options - Array parallel to parts.
  def initialize(delimiters, parts, part_options)
    @delimiters = delimiters
    @parts = parts
    @part_options = part_options
  end

  # Builds the strftime-style format string for this match.
  #
  # Raises RuntimeError when an unknown part Symbol is encountered.
  def format
    format = ''.dup
    @parts.each_with_index do |part, i|
      format << @delimiters[i - 1] if i != 0
      format << directive_for(part, @part_options[i])
    end
    format
  end

  # Matches with equal delimiters and parts describe the same layout and can
  # be merged with merge!.
  def mergeable_group
    [@delimiters, @parts]
  end

  # Folds the part options of another match of the same group into this one.
  # A nil option yields to a non-nil one; when both are set the greater one
  # (by sort order) is kept, e.g. the wider fraction.
  #
  # Returns self.
  def merge!(another_in_group)
    others = another_in_group.part_options
    @part_options.each_index do |i|
      mine = @part_options[i]
      theirs = others[i]
      @part_options[i] =
        if mine.nil?
          theirs
        elsif theirs.nil?
          mine
        else
          [mine, theirs].sort.last
        end
    end
    self
  end

  private

  # Maps one part and its option to a format directive.
  def directive_for(part, option)
    case part
    when :year
      '%Y'
    when :month
      # '%_m' (blank) and '%-m' (none) are not supported, so every heading
      # style maps to '%m'.
      '%m'
    when :day
      # the :none style is not supported and falls back to '%d'
      option == :blank ? '%e' : '%d'
    when :hour
      # the :none style is not supported and shares '%k' with :blank
      (option == :blank || option == :none) ? '%k' : '%H'
    when :minute
      # heading options are not supported
      '%M'
    when :second
      # heading options are not supported
      '%S'
    when :frac
      # up to three digits is milliseconds; wider fractions use '%N'
      option <= 3 ? '%L' : '%N'
    when :zone_off
      '%z'
    when :zone_abb
      '%Z'
    else
      raise "Unknown part: #{part}"
    end
  end
end
141
+
142
# Recognizes numeric timestamps made of a date, an optional time and an
# optional time zone, and reports the layout as a GuessMatch.
class GuessPattern
  include Parts

  date_delims = /[\/\-]/
  # yyyy-MM-dd
  YMD = /(?<year>#{YEAR})(?<date_delim>#{date_delims})(?<month>#{MONTH})\k<date_delim>(?<day>#{DAY})/
  YMD_NODELIM = /(?<year>#{YEAR})(?<month>#{MONTH_NODELIM})(?<day>#{DAY_NODELIM})/
  # dd/MM/yyyy (day first; these used to be verbatim copies of the
  # year-first expressions, which made the day-first branch unreachable)
  DMY = /(?<day>#{DAY})(?<date_delim>#{date_delims})(?<month>#{MONTH})\k<date_delim>(?<year>#{YEAR})/
  DMY_NODELIM = /(?<day>#{DAY_NODELIM})(?<month>#{MONTH_NODELIM})(?<year>#{YEAR})/

  frac = /[0-9]{1,24}/
  time_delims = /[\:\-]/
  frac_delims = /[\.\,]/
  TIME = /(?<hour>#{HOUR})(?<time_delim>#{time_delims})(?<minute>#{MINUTE})(?:\k<time_delim>(?<second>#{SECOND})(?:(?<frac_delim>#{frac_delims})(?<frac>#{frac}))?)?/
  TIME_NODELIM = /(?<hour>#{HOUR_NODELIM})(?<minute>#{MINUTE_NODELIM})((?<second>#{SECOND_NODELIM})(?:(?<frac_delim>#{frac_delims})(?<frac>#{frac}))?)?/

  TZ = /(?<zone_space> )?(?<zone>(?<zone_off>[\-\+]\d\d(?::?\d\d)?)|(?<zone_abb>[A-Z]{3}))|(?<z>Z)/

  # Tries to interpret text as date [time] [zone].
  #
  # Returns a GuessMatch describing the layout, or nil when text does not
  # start with a recognized date or has unrecognized trailing content.
  def match(text)
    delimiters = []
    parts = []
    part_options = []

    # Year-first layouts take precedence over day-first ones.
    if dm = (/^#{YMD}(?<rest>.*?)$/.match(text) || /^#{YMD_NODELIM}(?<rest>.*?)$/.match(text))
      date_parts = [:year, :month, :day]
    elsif dm = (/^#{DMY}(?<rest>.*?)$/.match(text) || /^#{DMY_NODELIM}(?<rest>.*?)$/.match(text))
      date_parts = [:day, :month, :year]
    else
      return nil
    end

    # The *_NODELIM expressions have no date_delim group at all.
    date_delim = named_capture(dm, "date_delim")

    # Exactly one delimiter goes between two consecutive parts; nothing is
    # appended after the last date part, so later delimiters stay aligned
    # with their parts.
    date_parts.each_with_index do |part, i|
      delimiters << date_delim if i > 0
      parts << part
      part_options << (part == :year ? nil : part_heading_option(dm[part.to_s]))
    end
    rest = dm["rest"]

    date_time_delims = /[ _T]/
    # A time without any separator directly after the date is accepted only
    # when the date itself had no delimiters.
    tm = /^(?<date_time_delim>#{date_time_delims})#{TIME}(?<rest>.*?)?$/.match(rest) ||
         /^(?<date_time_delim>#{date_time_delims})#{TIME_NODELIM}(?<rest>.*?)?$/.match(rest) ||
         (date_delim == "" && /^#{TIME_NODELIM}(?<rest>.*?)?$/.match(rest))
    if tm
      time_delim = named_capture(tm, "time_delim")

      delimiters << named_capture(tm, "date_time_delim")
      parts << :hour
      part_options << part_heading_option(tm["hour"])

      delimiters << time_delim
      parts << :minute
      part_options << part_heading_option(tm["minute"])

      if tm["second"]
        delimiters << time_delim
        parts << :second
        part_options << part_heading_option(tm["second"])
      end

      if tm["frac"]
        delimiters << tm["frac_delim"]
        parts << :frac
        # the option of a fraction is its number of digits
        part_options << tm["frac"].size
      end

      rest = tm["rest"]
    end

    if zm = /^#{TZ}$/.match(rest)
      # Parenthesized on purpose: `a << b || ''` parses as `(a << b) || ''`
      # and would push nil when the zone is not preceded by a space.
      delimiters << (zm["zone_space"] || '')
      if zm["z"]
        # TODO ISO 8601
        parts << :zone_off
      elsif zm["zone_off"]
        parts << :zone_off
      else
        parts << :zone_abb
      end
      part_options << nil

      GuessMatch.new(delimiters, parts, part_options)

    elsif rest =~ /^\s*$/
      GuessMatch.new(delimiters, parts, part_options)

    else
      nil
    end
  end

  # Classifies how a numeric field is padded: :zero for a leading "0",
  # :blank for a leading space, :none for a single unpadded character, and
  # nil otherwise.
  def part_heading_option(text)
    if text[0] == '0'
      :zero
    elsif text[0] == ' '
      :blank
    elsif text.size == 1
      :none
    else
      nil
    end
  end

  private

  # Returns the text captured by the named group, or "" when the expression
  # that produced the match data does not define that group.
  def named_capture(match_data, name)
    return "" unless match_data.names.include?(name)
    match_data[name] || ""
  end
end
267
+
268
# Match result of a RegexpPattern: a fixed, predetermined format string.
class RegexpMatch
  attr_reader :format

  # format - the format String reported for texts accepted by the pattern.
  def initialize(format)
    # The assignment is required; a bare `@format` expression stores nothing
    # and leaves both #format and #mergeable_group returning nil.
    @format = format
  end

  # Matches carrying the same format string belong to the same group.
  def mergeable_group
    @format
  end

  # Nothing to merge: every match of a group has the identical format.
  def merge!(another_in_group)
  end
end
282
+
283
# Pairs a regular expression with the fixed format string to report for any
# text that the expression accepts.
class RegexpPattern
  # regexp - Regexp tested against candidate texts.
  # format - format String wrapped into a single shared RegexpMatch.
  def initialize(regexp, format)
    @regexp = regexp
    @match = RegexpMatch.new(format)
  end

  # Returns the shared RegexpMatch when text is accepted, nil otherwise.
  def match(text)
    return @match if @regexp =~ text
    nil
  end
end
297
+
298
# Whole-text expressions for a few well-known textual timestamp layouts,
# assembled from the weekday and month name fragments in Parts.
module StandardPatterns
  include Parts

  # e.g. "Sun, 06 Nov 1994 08:49:37 GMT"
  RFC_822_1123 = /^#{WEEKDAY_NAME_SHORT}, \d\d #{MONTH_NAME_SHORT} \d\d\d\d \d\d:\d\d:\d\d [a-zA-Z]{3}$/

  # e.g. "Sunday, 06-Nov-94 08:49:37 GMT"
  RFC_850_1035 = /^#{WEEKDAY_NAME_FULL}, \d\d-#{MONTH_NAME_SHORT}-\d\d \d\d:\d\d:\d\d [a-zA-Z]{3}$/

  # e.g. "06/Nov/1994 08:49:37 +0900"
  APACHE_CLF = /^\d\d\/#{MONTH_NAME_SHORT}\/\d\d\d\d \d\d:\d\d:\d\d [\-\+]\d\d(?::?\d\d)?$/

  # e.g. "Sun Nov 6 08:49:37 1994"
  ANSI_C_ASCTIME = /^#{WEEKDAY_NAME_SHORT} #{MONTH_NAME_SHORT} \d\d? \d\d:\d\d:\d\d \d\d\d\d$/
end
306
+
307
# All patterns a text is tested against. Each format string mirrors what its
# expression accepts: the RFC layouts end in a three-letter zone abbreviation
# (%Z), the Apache layout ends in a numeric offset (%z), and the asctime
# layout starts with an abbreviated weekday name (%a).
PATTERNS = [
  GuessPattern.new,
  RegexpPattern.new(StandardPatterns::RFC_822_1123, "%a, %d %b %Y %H:%M:%S %Z"),
  RegexpPattern.new(StandardPatterns::RFC_850_1035, "%A, %d-%b-%y %H:%M:%S %Z"),
  RegexpPattern.new(StandardPatterns::APACHE_CLF, "%d/%b/%Y %H:%M:%S %z"),
  RegexpPattern.new(StandardPatterns::ANSI_C_ASCTIME, "%a %b %e %H:%M:%S %Y"),
]
314
+
315
# Guesses one format string for a text or a list of sample texts.
#
# texts - a String or an Array of Strings; empty strings are ignored.
#
# Every text is tested against every entry of PATTERNS. With several matches
# the largest group of mutually mergeable matches wins; its members are
# merged into the first one, whose format is returned.
#
# Returns the format String, or nil when nothing matches.
def self.guess(texts)
  texts = Array(texts).select {|text| text != "" }
  matches = texts.flat_map do |text|
    PATTERNS.map {|pattern| pattern.match(text) }.compact
  end

  return nil if matches.empty?
  return matches[0].format if matches.size == 1

  # group_by returns a Hash; compare the sizes of the member lists, not of
  # the [key, members] pairs (which always have size 2).
  match_groups = matches.group_by {|match| match.mergeable_group }.values
  best_match_group = match_groups.max_by {|group| group.size }
  best_match = best_match_group.shift
  best_match_group.each {|m| best_match.merge!(m) }
  best_match.format
end
332
+ end
333
+ end
@@ -1,3 +1,3 @@
1
1
  module Embulk
2
- VERSION = "0.4.1"
2
+ VERSION = "0.4.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.4.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sadayuki Furuhashi
@@ -267,6 +267,7 @@ files:
267
267
  - embulk-docs/src/release/release-0.3.2.rst
268
268
  - embulk-docs/src/release/release-0.4.0.rst
269
269
  - embulk-docs/src/release/release-0.4.1.rst
270
+ - embulk-docs/src/release/release-0.4.2.rst
270
271
  - embulk-standards/build.gradle
271
272
  - embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java
272
273
  - embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java
@@ -335,10 +336,11 @@ files:
335
336
  - lib/embulk/file_output_plugin.rb
336
337
  - lib/embulk/filter_plugin.rb
337
338
  - lib/embulk/formatter_plugin.rb
338
- - lib/embulk/guess_charset.rb
339
- - lib/embulk/guess_csv.rb
340
- - lib/embulk/guess_gzip.rb
341
- - lib/embulk/guess_newline.rb
339
+ - lib/embulk/guess/charset.rb
340
+ - lib/embulk/guess/csv.rb
341
+ - lib/embulk/guess/gzip.rb
342
+ - lib/embulk/guess/newline.rb
343
+ - lib/embulk/guess/time_format_guess.rb
342
344
  - lib/embulk/guess_plugin.rb
343
345
  - lib/embulk/input_plugin.rb
344
346
  - lib/embulk/java/bootstrap.rb
@@ -352,7 +354,6 @@ files:
352
354
  - lib/embulk/plugin.rb
353
355
  - lib/embulk/plugin_registry.rb
354
356
  - lib/embulk/schema.rb
355
- - lib/embulk/time_format_guess.rb
356
357
  - lib/embulk/version.rb
357
358
  - settings.gradle
358
359
  - classpath/annotations-3.0.0.jar
@@ -361,8 +362,8 @@ files:
361
362
  - classpath/bval-jsr303-0.5.jar
362
363
  - classpath/commons-beanutils-core-1.8.3.jar
363
364
  - classpath/commons-lang3-3.1.jar
364
- - classpath/embulk-core-0.4.1.jar
365
- - classpath/embulk-standards-0.4.1.jar
365
+ - classpath/embulk-core-0.4.2.jar
366
+ - classpath/embulk-standards-0.4.2.jar
366
367
  - classpath/guava-18.0.jar
367
368
  - classpath/guice-3.0.jar
368
369
  - classpath/guice-multibindings-3.0.jar
@@ -1,26 +0,0 @@
1
module Embulk

  # Guess plugin registered under the name 'charset'. It proposes the
  # parser's "charset" option using the ICU4J charset detector.
  class GuessCharset < GuessPlugin
    Plugin.register_guess('charset', self)

    # config        - guess configuration (not read by this method).
    # sample_buffer - sample of the input; converted with to_java_bytes.
    #
    # Returns a Hash of the form {"parser" => {"charset" => <name>}}.
    def guess(config, sample_buffer)
      # ICU4J
      detector = com.ibm.icu.text.CharsetDetector.new
      detector.setText(sample_buffer.to_java_bytes)
      candidate = detector.detect

      # UTF-8 is the fallback for low-confidence detections.
      charset = "UTF-8"
      if candidate.getConfidence >= 50
        detected = candidate.getName
        # ISO-8859-1 means ASCII which is a subset of UTF-8 in most of
        # cases due to lack of sample data set, so UTF-8 is kept.
        charset = detected unless detected == "ISO-8859-1"
      end

      {"parser" => {"charset" => charset}}
    end
  end

end
@@ -1,204 +0,0 @@
1
module Embulk
  require_relative 'time_format_guess'

  # Guess plugin registered under the name 'csv'. From sample lines it
  # proposes the delimiter, the quote character, whether the first line is a
  # header, and the column names and types of a CSV parser configuration.
  class GuessCsv < LineGuessPlugin
    Plugin.register_guess('csv', self)

    DELIMITER_CANDIDATES = [
      ",", "\t", "|"
    ]

    QUOTE_CANDIDATES = [
      "\"", "'"
    ]

    # CsvParserPlugin.TRUE_STRINGS
    # The list of pairs is passed to Hash[] as ONE argument. Splatting it
    # makes Hash[] treat consecutive pairs as key and value, so no string
    # would ever be found.
    TRUE_STRINGS = Hash[%w[
      true True TRUE
      yes Yes YES
      y Y
      on On ON
      1
    ].map {|k| [k, true] }]

    # config       - guess configuration; its "parser" entry, when present,
    #                is consulted for "header_line" and "columns".
    # sample_lines - Array of sample line Strings.
    #
    # Returns {"parser" => {...}} with the guessed options, or {} when the
    # sample does not look like CSV.
    def guess_lines(config, sample_lines)
      delim = guess_delimiter(sample_lines)
      unless delim
        # not CSV file
        return {}
      end

      parser_config = config["parser"] || {}
      parser_guessed = {"type" => "csv", "delimiter" => delim}

      quote = guess_quote(sample_lines, delim)
      parser_guessed["quote"] = quote ? quote : ''

      sample_records = sample_lines.map {|line| line.split(delim) }  # TODO use CsvTokenizer
      first_types = guess_field_types(sample_records[0, 1])
      other_types = guess_field_types(sample_records[1..-1])

      if first_types.size <= 1 || other_types.size <= 1
        # guess failed
        return {}
      end

      unless parser_config.has_key?("header_line")
        # A header is assumed when the first line is typed differently from
        # the rest and consists of plain strings only.
        parser_guessed["header_line"] = (first_types != other_types && first_types.all? {|t| t == ["string"] })
      end

      unless parser_config.has_key?("columns")
        if parser_guessed["header_line"] || parser_config["header_line"]
          column_names = sample_records.first
        else
          # exclusive range: exactly one generated name per guessed column
          column_names = (0...other_types.size).map {|i| "c#{i}" }
        end
        schema = []
        column_names.zip(other_types).each do |name,(type,format)|
          next unless name && type
          column = {"name" => name, "type" => type}
          column["format"] = format if format
          schema << column
        end
        parser_guessed["columns"] = schema
      end

      return {"parser" => parser_guessed}
    end

    private

    # Picks the candidate that occurs often and evenly across lines
    # (total occurrences divided by the standard deviation per line).
    # Returns the delimiter String, or nil when no candidate is convincing.
    def guess_delimiter(sample_lines)
      delim_weights = DELIMITER_CANDIDATES.map do |d|
        counts = sample_lines.map {|line| line.count(d) }
        total = array_sum(counts)
        if total > 0
          stddev = array_standard_deviation(counts)
          # avoid dividing by zero for perfectly even counts
          stddev = 0.000000001 if stddev == 0.0
          [d, total / stddev]
        else
          [nil, 0]
        end
      end

      delim, weight = *delim_weights.sort_by {|_, w| w }.last
      if delim != nil && weight > 1
        return delim
      else
        return nil
      end
    end

    # Scores each quote candidate per line (lines without the candidate are
    # skipped) and returns the best candidate when its average score reaches
    # 10.0, nil otherwise.
    def guess_quote(sample_lines, delim)
      delim_regexp = Regexp.escape(delim)
      quote_weights = QUOTE_CANDIDATES.map do |q|
        weights = sample_lines.map do |line|
          q_regexp = Regexp.escape(q)
          count = line.count(q)
          if count > 0
            weight = count
            weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{q_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 20
            weight += line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{delim_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size * 40
            weight
          else
            nil
          end
        end.compact
        weights.empty? ? 0 : array_avg(weights)
      end
      quote, weight = QUOTE_CANDIDATES.zip(quote_weights).sort_by {|q,w| w }.last
      if weight >= 10.0
        return quote
      else
        return nil
      end
    end

    # field_lines - Array of records, each an Array of field Strings.
    #
    # Returns one entry per column: ["timestamp", format] for timestamp
    # columns, [type] otherwise.
    def guess_field_types(field_lines)
      column_lines = []
      field_lines.each do |fields|
        fields.each_with_index {|field,i| (column_lines[i] ||= []) << guess_type(field) }
      end
      columns = column_lines.map do |types|
        merged = types.inject(nil) {|result,type| merge_type(result, type) } || "string"
        if merged.is_a?(TimestampMatch)
          format = TimeFormatGuess.guess(types.map {|type| type.text })
          ["timestamp", format]
        else
          [merged]
        end
      end
      return columns
    end

    # Widening rules for two different types, keyed by the sorted pair.
    TYPE_COALESCE = Hash[{
      long: :double,
      boolean: :long,
    }.map {|k,v|
      [[k.to_s, v.to_s].sort, v.to_s]
    }]

    # Combines the types of two values of the same column; pairs without a
    # widening rule degrade to "string".
    def merge_type(type1, type2)
      if type1 == type2
        type1
      elsif type1.nil? || type2.nil?
        type1 || type2
      else
        TYPE_COALESCE[[type1, type2].sort] || "string"
      end
    end

    # The String "timestamp" that also remembers the text it was guessed
    # from, so that a format can be guessed for the whole column later.
    class TimestampMatch < String
      def initialize(text)
        super("timestamp")
        @text = text
      end
      attr_reader :text
    end

    # Returns "boolean", a TimestampMatch, "long", "double" or "string" for
    # one field value.
    def guess_type(str)
      return "boolean" if TRUE_STRINGS[str]
      return TimestampMatch.new(str) if TimeFormatGuess.guess(str)
      return "long" if str.to_i.to_s == str

      if str.include?('.')
        a, b = str.split(".", 2)
        # The fraction must consist of digits only. Comparing it with its
        # to_i round trip would reject leading zeros ("1.05") and accept a
        # signed fraction ("1.-5").
        return "double" if a.to_i.to_s == a && b =~ /\A[0-9]+\z/
      end

      "string"
    end

    def array_sum(array)
      array.inject(0) {|r,i| r + i }
    end

    def array_avg(array)
      array.inject(0.0) {|r,i| r + i } / array.size
    end

    def array_variance(array)
      avg = array_avg(array)
      array.inject(0.0) {|r,i| r + (i - avg) ** 2 } / array.size
    end

    def array_standard_deviation(array)
      Math.sqrt(array_variance(array))
    end
  end

end