embulk-parser-csv_with_default_value 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +54 -0
  5. data/build.gradle +96 -0
  6. data/config/checkstyle/checkstyle.xml +128 -0
  7. data/config/checkstyle/default.xml +108 -0
  8. data/gradlew +160 -0
  9. data/gradlew.bat +90 -0
  10. data/lib/embulk/guess/csv_with_default_value.rb +61 -0
  11. data/lib/embulk/parser/csv_with_default_value.rb +3 -0
  12. data/src/main/java/org/embulk/parser/csv_with_default_value/ColumnDefaultValue.java +123 -0
  13. data/src/main/java/org/embulk/parser/csv_with_default_value/ColumnDefaultValueImpl.java +68 -0
  14. data/src/main/java/org/embulk/parser/csv_with_default_value/CsvRecordValidateException.java +13 -0
  15. data/src/main/java/org/embulk/parser/csv_with_default_value/CsvTokenizer.java +512 -0
  16. data/src/main/java/org/embulk/parser/csv_with_default_value/CsvWithDefaultValueParserPlugin.java +447 -0
  17. data/src/test/java/org/embulk/EmbulkTestRuntime.java +113 -0
  18. data/src/test/java/org/embulk/GuiceBinder.java +72 -0
  19. data/src/test/java/org/embulk/RandomManager.java +53 -0
  20. data/src/test/java/org/embulk/TestPluginSourceModule.java +23 -0
  21. data/src/test/java/org/embulk/TestUtilityModule.java +17 -0
  22. data/src/test/java/org/embulk/parser/csv_with_default_value/TestCsvWithDefaultValueParserPlugin.java +97 -0
  23. data/src/test/java/org/embulk/parser/csv_with_default_value/ValueTypeTest.java +47 -0
  24. data/src/test/java/org/embulk/spi/MockFormatterPlugin.java +108 -0
  25. data/src/test/java/org/embulk/spi/MockParserPlugin.java +80 -0
  26. metadata +97 -0
@@ -0,0 +1,447 @@
1
+ package org.embulk.parser.csv_with_default_value;
2
+
3
+ import com.google.common.base.Optional;
4
+ import com.google.common.collect.ImmutableSet;
5
+ import com.fasterxml.jackson.annotation.JsonCreator;
6
+ import com.fasterxml.jackson.annotation.JsonIgnore;
7
+ import com.fasterxml.jackson.annotation.JsonValue;
8
+ import org.embulk.config.Task;
9
+ import org.embulk.config.Config;
10
+ import org.embulk.config.ConfigDefault;
11
+ import org.embulk.config.ConfigSource;
12
+ import org.embulk.config.ConfigException;
13
+ import org.embulk.config.TaskSource;
14
+ import org.embulk.spi.*;
15
+ import org.embulk.spi.time.TimestampParser;
16
+ import org.embulk.spi.time.TimestampParseException;
17
+ import org.embulk.spi.json.JsonParser;
18
+ import org.embulk.spi.json.JsonParseException;
19
+ import org.embulk.spi.util.LineDecoder;
20
+ import org.embulk.spi.util.Timestamps;
21
+ import org.slf4j.Logger;
22
+
23
+ import java.util.Map;
24
+
25
+ public class CsvWithDefaultValueParserPlugin
26
+ implements ParserPlugin
27
+ {
28
+ private static final ImmutableSet<String> TRUE_STRINGS =
29
+ ImmutableSet.of(
30
+ "true", "True", "TRUE",
31
+ "yes", "Yes", "YES",
32
+ "t", "T", "y", "Y",
33
+ "on", "On", "ON",
34
+ "1");
35
+
36
+ public interface PluginTask
37
+ extends Task, LineDecoder.DecoderTask, TimestampParser.Task
38
+ {
39
+ @Config("columns")
40
+ SchemaConfig getSchemaConfig();
41
+
42
+ @Config("header_line")
43
+ @ConfigDefault("null")
44
+ Optional<Boolean> getHeaderLine();
45
+
46
+ @Config("skip_header_lines")
47
+ @ConfigDefault("0")
48
+ int getSkipHeaderLines();
49
+ void setSkipHeaderLines(int n);
50
+
51
+ @Config("delimiter")
52
+ @ConfigDefault("\",\"")
53
+ String getDelimiter();
54
+
55
+ @Config("quote")
56
+ @ConfigDefault("\"\\\"\"")
57
+ Optional<QuoteCharacter> getQuoteChar();
58
+
59
+ @Config("escape")
60
+ @ConfigDefault("\"\\\\\"")
61
+ Optional<EscapeCharacter> getEscapeChar();
62
+
63
+ // Null value handling: if the CsvParser found 'non-quoted empty string's,
64
+ // it replaces them to string that users specified like "\N", "NULL".
65
+ @Config("null_string")
66
+ @ConfigDefault("null")
67
+ Optional<String> getNullString();
68
+
69
+ @Config("trim_if_not_quoted")
70
+ @ConfigDefault("false")
71
+ boolean getTrimIfNotQuoted();
72
+
73
+ @Config("max_quoted_size_limit")
74
+ @ConfigDefault("131072") //128kB
75
+ long getMaxQuotedSizeLimit();
76
+
77
+ @Config("comment_line_marker")
78
+ @ConfigDefault("null")
79
+ Optional<String> getCommentLineMarker();
80
+
81
+ @Config("allow_optional_columns")
82
+ @ConfigDefault("false")
83
+ boolean getAllowOptionalColumns();
84
+
85
+ @Config("allow_extra_columns")
86
+ @ConfigDefault("false")
87
+ boolean getAllowExtraColumns();
88
+
89
+ @Config("stop_on_invalid_record")
90
+ @ConfigDefault("false")
91
+ boolean getStopOnInvalidRecord();
92
+
93
+ @Config("default_values")
94
+ @ConfigDefault("{}")
95
+ Map<String, ColumnDefaultValue> getDefaultValues();
96
+
97
+ }
98
+
99
+ public static class QuoteCharacter
100
+ {
101
+ private final char character;
102
+
103
+ public QuoteCharacter(char character)
104
+ {
105
+ this.character = character;
106
+ }
107
+
108
+ public static QuoteCharacter noQuote()
109
+ {
110
+ return new QuoteCharacter(CsvTokenizer.NO_QUOTE);
111
+ }
112
+
113
+ @JsonCreator
114
+ public static QuoteCharacter ofString(String str)
115
+ {
116
+ if (str.length() >= 2) {
117
+ throw new ConfigException("\"quote\" option accepts only 1 character.");
118
+ } else if (str.isEmpty()) {
119
+ Exec.getLogger(CsvWithDefaultValueParserPlugin.class).warn("Setting '' (empty string) to \"quote\" option is obsoleted. Currently it becomes '\"' automatically but this behavior will be removed. Please set '\"' explicitly.");
120
+ return new QuoteCharacter('"');
121
+ } else {
122
+ return new QuoteCharacter(str.charAt(0));
123
+ }
124
+ }
125
+
126
+ @JsonIgnore
127
+ public char getCharacter()
128
+ {
129
+ return character;
130
+ }
131
+
132
+ @JsonValue
133
+ public String getOptionalString()
134
+ {
135
+ return new String(new char[] { character });
136
+ }
137
+
138
+ @Override
139
+ public boolean equals(Object obj)
140
+ {
141
+ if (!(obj instanceof QuoteCharacter)) {
142
+ return false;
143
+ }
144
+ QuoteCharacter o = (QuoteCharacter) obj;
145
+ return character == o.character;
146
+ }
147
+ }
148
+
149
+ public static class EscapeCharacter
150
+ {
151
+ private final char character;
152
+
153
+ public EscapeCharacter(char character)
154
+ {
155
+ this.character = character;
156
+ }
157
+
158
+ public static EscapeCharacter noEscape()
159
+ {
160
+ return new EscapeCharacter(CsvTokenizer.NO_ESCAPE);
161
+ }
162
+
163
+ @JsonCreator
164
+ public static EscapeCharacter ofString(String str)
165
+ {
166
+ if (str.length() >= 2) {
167
+ throw new ConfigException("\"escape\" option accepts only 1 character.");
168
+ } else if (str.isEmpty()) {
169
+ Exec.getLogger(CsvWithDefaultValueParserPlugin.class).warn("Setting '' (empty string) to \"escape\" option is obsoleted. Currently it becomes null automatically but this behavior will be removed. Please set \"escape: null\" explicitly.");
170
+ return noEscape();
171
+ } else {
172
+ return new EscapeCharacter(str.charAt(0));
173
+ }
174
+ }
175
+
176
+ @JsonIgnore
177
+ public char getCharacter()
178
+ {
179
+ return character;
180
+ }
181
+
182
+ @JsonValue
183
+ public String getOptionalString()
184
+ {
185
+ return new String(new char[] { character });
186
+ }
187
+
188
+ @Override
189
+ public boolean equals(Object obj)
190
+ {
191
+ if (!(obj instanceof EscapeCharacter)) {
192
+ return false;
193
+ }
194
+ EscapeCharacter o = (EscapeCharacter) obj;
195
+ return character == o.character;
196
+ }
197
+ }
198
+
199
/** Logger used for warnings about skipped records. */
private final Logger log;

/**
 * Creates the plugin and obtains its logger from {@code Exec}.
 */
public CsvWithDefaultValueParserPlugin()
{
    this.log = Exec.getLogger(CsvWithDefaultValueParserPlugin.class);
}
205
+
206
+ @Override
207
+ public void transaction(ConfigSource config, ParserPlugin.Control control)
208
+ {
209
+ PluginTask task = config.loadConfig(PluginTask.class);
210
+
211
+ // backward compatibility
212
+ if (task.getHeaderLine().isPresent()) {
213
+ if (task.getSkipHeaderLines() > 0) {
214
+ throw new ConfigException("'header_line' option is invalid if 'skip_header_lines' is set.");
215
+ }
216
+ if (task.getHeaderLine().get()) {
217
+ task.setSkipHeaderLines(1);
218
+ } else {
219
+ task.setSkipHeaderLines(0);
220
+ }
221
+ }
222
+
223
+ control.run(task.dump(), task.getSchemaConfig().toSchema());
224
+ }
225
+
226
+ @Override
227
+ public void run(TaskSource taskSource, final Schema schema,
228
+ FileInput input, PageOutput output)
229
+ {
230
+ PluginTask task = taskSource.loadTask(PluginTask.class);
231
+ final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getSchemaConfig());
232
+ final JsonParser jsonParser = new JsonParser();
233
+ final CsvTokenizer tokenizer = new CsvTokenizer(new LineDecoder(input, task), task);
234
+ final boolean allowOptionalColumns = task.getAllowOptionalColumns();
235
+ final boolean allowExtraColumns = task.getAllowExtraColumns();
236
+ final boolean stopOnInvalidRecord = task.getStopOnInvalidRecord();
237
+ int skipHeaderLines = task.getSkipHeaderLines();
238
+
239
+ try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
240
+ while (tokenizer.nextFile()) {
241
+ // skip the header lines for each file
242
+ for (; skipHeaderLines > 0; skipHeaderLines--) {
243
+ if (!tokenizer.skipHeaderLine()) {
244
+ break;
245
+ }
246
+ }
247
+
248
+ if (!tokenizer.nextRecord()) {
249
+ // empty file
250
+ continue;
251
+ }
252
+
253
+ while (true) {
254
+ boolean hasNextRecord;
255
+
256
+ try {
257
+ schema.visitColumns(new DefaultValueAwareColumnVisitor(pageBuilder, task, tokenizer, timestampParsers));
258
+
259
+ try {
260
+ hasNextRecord = tokenizer.nextRecord();
261
+ } catch (CsvTokenizer.TooManyColumnsException ex) {
262
+ if (allowExtraColumns) {
263
+ String tooManyColumnsLine = tokenizer.skipCurrentLine();
264
+ // TODO warning
265
+ hasNextRecord = tokenizer.nextRecord();
266
+ } else {
267
+ // this line will be skipped at the following catch section
268
+ throw ex;
269
+ }
270
+ }
271
+ pageBuilder.addRecord();
272
+
273
+ } catch (CsvTokenizer.InvalidFormatException | CsvTokenizer.InvalidValueException | CsvRecordValidateException e) {
274
+ String skippedLine = tokenizer.skipCurrentLine();
275
+ long lineNumber = tokenizer.getCurrentLineNumber();
276
+ if (stopOnInvalidRecord) {
277
+ throw new DataException(String.format("Invalid record at line %d: %s", lineNumber, skippedLine), e);
278
+ }
279
+ log.warn(String.format("Skipped line %d (%s): %s", lineNumber, e.getMessage(), skippedLine));
280
+ //exec.notice().skippedLine(skippedLine);
281
+
282
+ hasNextRecord = tokenizer.nextRecord();
283
+ }
284
+
285
+ if (!hasNextRecord) {
286
+ break;
287
+ }
288
+ }
289
+ }
290
+
291
+ pageBuilder.finish();
292
+ }
293
+
294
+ }
295
+
296
+ static class DefaultValueAwareColumnVisitor implements ColumnVisitor {
297
+
298
+ private final PageBuilder pageBuilder;
299
+ private final PluginTask task;
300
+ private final TimestampParser[] timestampParsers;
301
+ private final JsonParser jsonParser;
302
+ private final boolean allowOptionalColumns;
303
+ private final CsvTokenizer tokenizer;
304
+ private final Logger log = Exec.getLogger(CsvWithDefaultValueParserPlugin.class);
305
+
306
+ DefaultValueAwareColumnVisitor(PageBuilder pageBuilder, PluginTask task, CsvTokenizer tokenizer, TimestampParser[] timestampParsers) {
307
+ this.pageBuilder = pageBuilder;
308
+ this.timestampParsers = timestampParsers;
309
+ this.jsonParser = new JsonParser();
310
+ this.tokenizer = tokenizer;
311
+ this.allowOptionalColumns = task.getAllowOptionalColumns();
312
+ this.task = task;
313
+ assertDefaultValuesAreAllowedForTypes();
314
+ }
315
+
316
+ private void assertDefaultValuesAreAllowedForTypes(){
317
+ for(Map.Entry<String, ColumnDefaultValue> e: task.getDefaultValues().entrySet()){
318
+ ColumnConfig col = task.getSchemaConfig().lookupColumn(e.getKey());
319
+ if(col == null){
320
+ throw new ConfigException(String.format("column %s is not found.", e.getKey()));
321
+ }else if(!ColumnDefaultValue.ALLOWED_TYPES.contains(col.getType())){
322
+ throw new ConfigException(String.format("default value are allowed for only %s", ColumnDefaultValue.ALLOWED_TYPES_NAME));
323
+ }
324
+ }
325
+ }
326
+
327
+ public void booleanColumn(Column column)
328
+ {
329
+ String v = nextColumn();
330
+ if (v == null) {
331
+ pageBuilder.setNull(column);
332
+ } else {
333
+ pageBuilder.setBoolean(column, TRUE_STRINGS.contains(v));
334
+ }
335
+ }
336
+
337
+ public void longColumn(Column column)
338
+ {
339
+ String v = nextColumn();
340
+ if (v == null) {
341
+ pageBuilder.setNull(column);
342
+ } else {
343
+ try {
344
+ pageBuilder.setLong(column, Long.parseLong(v));
345
+ } catch (NumberFormatException e) {
346
+ final Optional<ColumnDefaultValue> defaultValue = getDefaultValue(task, column);
347
+ if(defaultValue.isPresent()){
348
+ defaultValue.get().getType().longValue(defaultValue.get(), pageBuilder, column);
349
+ log.warn(String.format("Applying default value due to fail to parse: %s(%s)", v, column.getName()));
350
+ }else {
351
+ throw new CsvRecordValidateException(e);
352
+ }
353
+
354
+ }
355
+ }
356
+ }
357
+
358
+ public void doubleColumn(Column column)
359
+ {
360
+ String v = nextColumn();
361
+ if (v == null) {
362
+ pageBuilder.setNull(column);
363
+ } else {
364
+ try {
365
+ pageBuilder.setDouble(column, Double.parseDouble(v));
366
+ } catch (NumberFormatException e) {
367
+ final Optional<ColumnDefaultValue> defaultValue = getDefaultValue(task, column);
368
+ if(defaultValue.isPresent()){
369
+ defaultValue.get().getType().doubleValue(defaultValue.get(), pageBuilder, column);
370
+ log.warn(String.format("Applying default value due to fail to parse: %s(%s)", v, column.getName()));
371
+ }else {
372
+ throw new CsvRecordValidateException(e);
373
+ } }
374
+ }
375
+ }
376
+
377
+ public void stringColumn(Column column)
378
+ {
379
+ String v = nextColumn();
380
+ if (v == null) {
381
+ pageBuilder.setNull(column);
382
+ } else {
383
+ pageBuilder.setString(column, v);
384
+ }
385
+ }
386
+
387
+ public void timestampColumn(Column column)
388
+ {
389
+ String v = nextColumn();
390
+ if (v == null) {
391
+ pageBuilder.setNull(column);
392
+ } else {
393
+ try {
394
+ pageBuilder.setTimestamp(column, timestampParsers[column.getIndex()].parse(v));
395
+ } catch (TimestampParseException e) {
396
+ final Optional<ColumnDefaultValue> defaultValue = getDefaultValue(task, column);
397
+ if(defaultValue.isPresent()){
398
+ defaultValue.get().getType().timestampValue(defaultValue.get(), timestampParsers[column.getIndex()], pageBuilder, column);
399
+ log.warn(String.format("Applying default value due to fail to parse: %s(%s)", v, column.getName()));
400
+ }else{
401
+ throw new CsvRecordValidateException(e);
402
+ }
403
+
404
+ }
405
+ }
406
+ }
407
+
408
+ public void jsonColumn(Column column)
409
+ {
410
+ String v = nextColumn();
411
+ if (v == null) {
412
+ pageBuilder.setNull(column);
413
+ } else {
414
+ try {
415
+ pageBuilder.setJson(column, jsonParser.parse(v));
416
+ } catch (JsonParseException e) {
417
+ // TODO support default value
418
+ throw new CsvRecordValidateException(e);
419
+ }
420
+ }
421
+ }
422
+
423
+ private String nextColumn()
424
+ {
425
+ if (allowOptionalColumns && !tokenizer.hasNextColumn()) {
426
+ //TODO warning
427
+ return null;
428
+ }
429
+ return tokenizer.nextColumnOrNull();
430
+ }
431
+
432
+ protected Optional<ColumnDefaultValue> getDefaultValue(final PluginTask task, final Column column){
433
+ final ColumnDefaultValue value = task.getDefaultValues().get(column.getName());
434
+ if(value == null){
435
+ return Optional.absent();
436
+ }
437
+ //Check values is set if immediate
438
+ if(value.getType() == ColumnDefaultValue.ValueType.IMMEDIATE && !value.getDefaultValue().isPresent()){
439
+ throw new ConfigException(String.format("default_value is not set to column '%s'", column.getName()));
440
+ }else if(value.getType() == ColumnDefaultValue.ValueType.NULL && value.getDefaultValue().isPresent()){
441
+ throw new ConfigException(String.format("default_value is set to column '%s', even though type is null.", column.getName()));
442
+ }
443
+ return Optional.of(value);
444
+ }
445
+
446
+ }
447
+ }
@@ -0,0 +1,113 @@
1
+ package org.embulk;
2
+
3
+ import java.util.Random;
4
+ import org.junit.runner.Description;
5
+ import org.junit.runners.model.Statement;
6
+ import com.google.inject.Injector;
7
+ import com.google.inject.Binder;
8
+ import com.google.inject.Module;
9
+ import org.embulk.config.ConfigSource;
10
+ import org.embulk.config.DataSourceImpl;
11
+ import org.embulk.config.ModelManager;
12
+ import org.embulk.exec.SystemConfigModule;
13
+ import org.embulk.exec.ExecModule;
14
+ import org.embulk.exec.ExtensionServiceLoaderModule;
15
+ import org.embulk.plugin.BuiltinPluginSourceModule;
16
+ import org.embulk.jruby.JRubyScriptingModule;
17
+ import org.embulk.spi.BufferAllocator;
18
+ import org.embulk.spi.Exec;
19
+ import org.embulk.spi.ExecAction;
20
+ import org.embulk.spi.ExecSession;
21
+
22
+ public class EmbulkTestRuntime extends GuiceBinder
23
+ {
24
+ private static ConfigSource getSystemConfig()
25
+ {
26
+ // TODO set some default values
27
+ return new DataSourceImpl(null);
28
+ }
29
+
30
+ public static class TestRuntimeModule
31
+ implements Module
32
+ {
33
+ @Override
34
+ public void configure(Binder binder)
35
+ {
36
+ ConfigSource systemConfig = getSystemConfig();
37
+ new SystemConfigModule(systemConfig).configure(binder);
38
+ new ExecModule().configure(binder);
39
+ new ExtensionServiceLoaderModule(systemConfig).configure(binder);
40
+ new BuiltinPluginSourceModule().configure(binder);
41
+ new JRubyScriptingModule(systemConfig).configure(binder);
42
+ new TestUtilityModule().configure(binder);
43
+ new TestPluginSourceModule().configure(binder);
44
+ }
45
+ }
46
+
47
+ private ExecSession exec;
48
+
49
+ public EmbulkTestRuntime()
50
+ {
51
+ super(new TestRuntimeModule());
52
+ Injector injector = getInjector();
53
+ ConfigSource execConfig = new DataSourceImpl(injector.getInstance(ModelManager.class));
54
+ this.exec = ExecSession.builder(injector).fromExecConfig(execConfig).build();
55
+ }
56
+
57
+ public ExecSession getExec()
58
+ {
59
+ return exec;
60
+ }
61
+
62
+ public BufferAllocator getBufferAllocator()
63
+ {
64
+ return getInstance(BufferAllocator.class);
65
+ }
66
+
67
+ public ModelManager getModelManager()
68
+ {
69
+ return getInstance(ModelManager.class);
70
+ }
71
+
72
+ public Random getRandom()
73
+ {
74
+ return getInstance(RandomManager.class).getRandom();
75
+ }
76
+
77
+ @Override
78
+ public Statement apply(Statement base, Description description)
79
+ {
80
+ final Statement superStatement = EmbulkTestRuntime.super.apply(base, description);
81
+ return new Statement() {
82
+ public void evaluate() throws Throwable
83
+ {
84
+ try {
85
+ Exec.doWith(exec, new ExecAction<Void>() {
86
+ public Void run()
87
+ {
88
+ try {
89
+ superStatement.evaluate();
90
+ } catch (Throwable ex) {
91
+ throw new RuntimeExecutionException(ex);
92
+ }
93
+ return null;
94
+ }
95
+ });
96
+ } catch (RuntimeException ex) {
97
+ throw ex.getCause();
98
+ } finally {
99
+ exec.cleanup();
100
+ }
101
+ }
102
+ };
103
+ }
104
+
105
+ private static class RuntimeExecutionException
106
+ extends RuntimeException
107
+ {
108
+ public RuntimeExecutionException(Throwable cause)
109
+ {
110
+ super(cause);
111
+ }
112
+ }
113
+ }