embulk 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (204):
  1. checksums.yaml +15 -0
  2. data/.gitignore +13 -0
  3. data/Gemfile +3 -0
  4. data/Gemfile.lock +33 -0
  5. data/README.md +117 -0
  6. data/Rakefile +58 -0
  7. data/bin/embulk +63 -0
  8. data/build.gradle +149 -0
  9. data/embulk-cli/build.gradle +6 -0
  10. data/embulk-cli/pom.xml +94 -0
  11. data/embulk-cli/src/main/java/org/embulk/cli/Main.java +15 -0
  12. data/embulk-core/build.gradle +6 -0
  13. data/embulk-core/pom.xml +143 -0
  14. data/embulk-core/src/main/java/org/embulk/EmbulkService.java +39 -0
  15. data/embulk-core/src/main/java/org/embulk/command/Runner.java +199 -0
  16. data/embulk-core/src/main/java/org/embulk/command/TablePrinter.java +119 -0
  17. data/embulk-core/src/main/java/org/embulk/config/CommitReport.java +26 -0
  18. data/embulk-core/src/main/java/org/embulk/config/Config.java +15 -0
  19. data/embulk-core/src/main/java/org/embulk/config/ConfigDefault.java +15 -0
  20. data/embulk-core/src/main/java/org/embulk/config/ConfigException.java +20 -0
  21. data/embulk-core/src/main/java/org/embulk/config/ConfigLoader.java +83 -0
  22. data/embulk-core/src/main/java/org/embulk/config/ConfigSource.java +28 -0
  23. data/embulk-core/src/main/java/org/embulk/config/DataSource.java +35 -0
  24. data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +208 -0
  25. data/embulk-core/src/main/java/org/embulk/config/DataSourceSerDe.java +80 -0
  26. data/embulk-core/src/main/java/org/embulk/config/GenericTypeReference.java +20 -0
  27. data/embulk-core/src/main/java/org/embulk/config/ModelManager.java +125 -0
  28. data/embulk-core/src/main/java/org/embulk/config/NextConfig.java +26 -0
  29. data/embulk-core/src/main/java/org/embulk/config/Task.java +10 -0
  30. data/embulk-core/src/main/java/org/embulk/config/TaskInvocationHandler.java +180 -0
  31. data/embulk-core/src/main/java/org/embulk/config/TaskSerDe.java +343 -0
  32. data/embulk-core/src/main/java/org/embulk/config/TaskSource.java +28 -0
  33. data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +37 -0
  34. data/embulk-core/src/main/java/org/embulk/config/TaskValidator.java +24 -0
  35. data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +45 -0
  36. data/embulk-core/src/main/java/org/embulk/exec/ExecuteInterruptedException.java +10 -0
  37. data/embulk-core/src/main/java/org/embulk/exec/ExecuteResult.java +19 -0
  38. data/embulk-core/src/main/java/org/embulk/exec/ExtensionServiceLoaderModule.java +43 -0
  39. data/embulk-core/src/main/java/org/embulk/exec/ForSystemConfig.java +16 -0
  40. data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +307 -0
  41. data/embulk-core/src/main/java/org/embulk/exec/LocalExecutor.java +274 -0
  42. data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +30 -0
  43. data/embulk-core/src/main/java/org/embulk/exec/NoSampleException.java +10 -0
  44. data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +58 -0
  45. data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +138 -0
  46. data/embulk-core/src/main/java/org/embulk/exec/PreviewResult.java +27 -0
  47. data/embulk-core/src/main/java/org/embulk/exec/PreviewedNoticeError.java +17 -0
  48. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +116 -0
  49. data/embulk-core/src/main/java/org/embulk/exec/SystemConfigModule.java +24 -0
  50. data/embulk-core/src/main/java/org/embulk/jruby/JRubyPluginSource.java +69 -0
  51. data/embulk-core/src/main/java/org/embulk/jruby/JRubyScriptingModule.java +100 -0
  52. data/embulk-core/src/main/java/org/embulk/plugin/BuiltinPluginSourceModule.java +17 -0
  53. data/embulk-core/src/main/java/org/embulk/plugin/InjectedPluginSource.java +92 -0
  54. data/embulk-core/src/main/java/org/embulk/plugin/PluginManager.java +34 -0
  55. data/embulk-core/src/main/java/org/embulk/plugin/PluginSource.java +6 -0
  56. data/embulk-core/src/main/java/org/embulk/plugin/PluginSourceNotMatchException.java +19 -0
  57. data/embulk-core/src/main/java/org/embulk/plugin/PluginType.java +47 -0
  58. data/embulk-core/src/main/java/org/embulk/plugin/SetThreadContextClassLoader.java +19 -0
  59. data/embulk-core/src/main/java/org/embulk/spi/Buffer.java +113 -0
  60. data/embulk-core/src/main/java/org/embulk/spi/BufferAllocator.java +8 -0
  61. data/embulk-core/src/main/java/org/embulk/spi/Column.java +92 -0
  62. data/embulk-core/src/main/java/org/embulk/spi/ColumnConfig.java +79 -0
  63. data/embulk-core/src/main/java/org/embulk/spi/DecoderPlugin.java +16 -0
  64. data/embulk-core/src/main/java/org/embulk/spi/EncoderPlugin.java +16 -0
  65. data/embulk-core/src/main/java/org/embulk/spi/Exec.java +76 -0
  66. data/embulk-core/src/main/java/org/embulk/spi/ExecAction.java +6 -0
  67. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +105 -0
  68. data/embulk-core/src/main/java/org/embulk/spi/Extension.java +42 -0
  69. data/embulk-core/src/main/java/org/embulk/spi/FileInput.java +11 -0
  70. data/embulk-core/src/main/java/org/embulk/spi/FileInputPlugin.java +19 -0
  71. data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +113 -0
  72. data/embulk-core/src/main/java/org/embulk/spi/FileOutput.java +13 -0
  73. data/embulk-core/src/main/java/org/embulk/spi/FileOutputPlugin.java +20 -0
  74. data/embulk-core/src/main/java/org/embulk/spi/FileOutputRunner.java +167 -0
  75. data/embulk-core/src/main/java/org/embulk/spi/FormatterPlugin.java +18 -0
  76. data/embulk-core/src/main/java/org/embulk/spi/GuessPlugin.java +9 -0
  77. data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +20 -0
  78. data/embulk-core/src/main/java/org/embulk/spi/OutputPlugin.java +21 -0
  79. data/embulk-core/src/main/java/org/embulk/spi/Page.java +45 -0
  80. data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +327 -0
  81. data/embulk-core/src/main/java/org/embulk/spi/PageFormat.java +47 -0
  82. data/embulk-core/src/main/java/org/embulk/spi/PageOutput.java +11 -0
  83. data/embulk-core/src/main/java/org/embulk/spi/PageReader.java +227 -0
  84. data/embulk-core/src/main/java/org/embulk/spi/ParserPlugin.java +17 -0
  85. data/embulk-core/src/main/java/org/embulk/spi/Schema.java +101 -0
  86. data/embulk-core/src/main/java/org/embulk/spi/SchemaConfig.java +52 -0
  87. data/embulk-core/src/main/java/org/embulk/spi/SchemaVisitor.java +14 -0
  88. data/embulk-core/src/main/java/org/embulk/spi/Transactional.java +10 -0
  89. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileInput.java +17 -0
  90. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileOutput.java +19 -0
  91. data/embulk-core/src/main/java/org/embulk/spi/TransactionalPageOutput.java +17 -0
  92. data/embulk-core/src/main/java/org/embulk/spi/time/DateTimeZoneSerDe.java +57 -0
  93. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelper.java +8 -0
  94. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelperFactory.java +6 -0
  95. data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +159 -0
  96. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +98 -0
  97. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormatter.java +55 -0
  98. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParseException.java +6 -0
  99. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParser.java +60 -0
  100. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampSerDe.java +50 -0
  101. data/embulk-core/src/main/java/org/embulk/spi/type/AbstractType.java +55 -0
  102. data/embulk-core/src/main/java/org/embulk/spi/type/BooleanType.java +12 -0
  103. data/embulk-core/src/main/java/org/embulk/spi/type/DoubleType.java +12 -0
  104. data/embulk-core/src/main/java/org/embulk/spi/type/LongType.java +12 -0
  105. data/embulk-core/src/main/java/org/embulk/spi/type/StringType.java +12 -0
  106. data/embulk-core/src/main/java/org/embulk/spi/type/TimestampType.java +39 -0
  107. data/embulk-core/src/main/java/org/embulk/spi/type/Type.java +15 -0
  108. data/embulk-core/src/main/java/org/embulk/spi/type/TypeDeserializer.java +47 -0
  109. data/embulk-core/src/main/java/org/embulk/spi/type/Types.java +14 -0
  110. data/embulk-core/src/main/java/org/embulk/spi/util/CharsetSerDe.java +55 -0
  111. data/embulk-core/src/main/java/org/embulk/spi/util/Decoders.java +81 -0
  112. data/embulk-core/src/main/java/org/embulk/spi/util/Encoders.java +81 -0
  113. data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +110 -0
  114. data/embulk-core/src/main/java/org/embulk/spi/util/FileOutputOutputStream.java +94 -0
  115. data/embulk-core/src/main/java/org/embulk/spi/util/InputStreamFileInput.java +111 -0
  116. data/embulk-core/src/main/java/org/embulk/spi/util/Inputs.java +74 -0
  117. data/embulk-core/src/main/java/org/embulk/spi/util/LineDecoder.java +118 -0
  118. data/embulk-core/src/main/java/org/embulk/spi/util/LineEncoder.java +109 -0
  119. data/embulk-core/src/main/java/org/embulk/spi/util/ListFileInput.java +52 -0
  120. data/embulk-core/src/main/java/org/embulk/spi/util/Newline.java +38 -0
  121. data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +102 -0
  122. data/embulk-core/src/main/java/org/embulk/spi/util/Pages.java +139 -0
  123. data/embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java +110 -0
  124. data/embulk-core/src/test/java/org/embulk/GuiceBinder.java +72 -0
  125. data/embulk-core/src/test/java/org/embulk/RandomManager.java +53 -0
  126. data/embulk-core/src/test/java/org/embulk/TestPluginSourceModule.java +23 -0
  127. data/embulk-core/src/test/java/org/embulk/TestUtilityModule.java +17 -0
  128. data/embulk-core/src/test/java/org/embulk/config/TestConfigSource.java +114 -0
  129. data/embulk-core/src/test/java/org/embulk/config/TestTaskSource.java +70 -0
  130. data/embulk-core/src/test/java/org/embulk/plugin/MockPluginSource.java +57 -0
  131. data/embulk-core/src/test/java/org/embulk/plugin/TestPluginType.java +18 -0
  132. data/embulk-core/src/test/java/org/embulk/spi/MockFileOutput.java +63 -0
  133. data/embulk-core/src/test/java/org/embulk/spi/MockFormatterPlugin.java +101 -0
  134. data/embulk-core/src/test/java/org/embulk/spi/MockParserPlugin.java +73 -0
  135. data/embulk-core/src/test/java/org/embulk/spi/PageTestUtils.java +78 -0
  136. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +67 -0
  137. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputRunner.java +180 -0
  138. data/embulk-core/src/test/java/org/embulk/spi/TestFileOutputRunner.java +192 -0
  139. data/embulk-core/src/test/java/org/embulk/spi/TestInputStreamFileInput.java +188 -0
  140. data/embulk-core/src/test/java/org/embulk/spi/TestPageBuilderReader.java +301 -0
  141. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestamp.java +116 -0
  142. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestampFormatterParser.java +52 -0
  143. data/embulk-core/src/test/java/org/embulk/spi/type/TestTypeSerDe.java +45 -0
  144. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineDecoder.java +132 -0
  145. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineEncoder.java +123 -0
  146. data/embulk-standards/build.gradle +6 -0
  147. data/embulk-standards/pom.xml +68 -0
  148. data/embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java +158 -0
  149. data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +233 -0
  150. data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +355 -0
  151. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java +55 -0
  152. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java +39 -0
  153. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +138 -0
  154. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java +128 -0
  155. data/embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java +46 -0
  156. data/embulk-standards/src/main/java/org/embulk/standards/S3FileInputPlugin.java +238 -0
  157. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginExtension.java +16 -0
  158. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +44 -0
  159. data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +71 -0
  160. data/embulk-standards/src/main/resources/META-INF/services/org.embulk.spi.Extension +1 -0
  161. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +69 -0
  162. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +291 -0
  163. data/embulk-standards/src/test/java/org/embulk/standards/TestS3FileInputPlugin.java +43 -0
  164. data/embulk.gemspec +27 -0
  165. data/examples/config.yml +34 -0
  166. data/examples/csv/sample.csv.gz +0 -0
  167. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  168. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  169. data/gradlew +164 -0
  170. data/gradlew.bat +90 -0
  171. data/lib/embulk.rb +16 -0
  172. data/lib/embulk/buffer.rb +17 -0
  173. data/lib/embulk/column.rb +47 -0
  174. data/lib/embulk/command/embulk.rb +39 -0
  175. data/lib/embulk/command/embulk_example.rb +32 -0
  176. data/lib/embulk/command/embulk_generate_bin.rb +62 -0
  177. data/lib/embulk/command/embulk_run.rb +243 -0
  178. data/lib/embulk/data/bundle/.bundle/config +3 -0
  179. data/lib/embulk/data/bundle/Gemfile +31 -0
  180. data/lib/embulk/data/bundle/Gemfile.lock +8 -0
  181. data/lib/embulk/data/bundle/embulk/input_example.rb +40 -0
  182. data/lib/embulk/data/bundle/embulk/output_example.rb +51 -0
  183. data/lib/embulk/data_source.rb +66 -0
  184. data/lib/embulk/error.rb +5 -0
  185. data/lib/embulk/guess_charset.rb +26 -0
  186. data/lib/embulk/guess_csv.rb +195 -0
  187. data/lib/embulk/guess_gzip.rb +18 -0
  188. data/lib/embulk/guess_newline.rb +20 -0
  189. data/lib/embulk/guess_plugin.rb +113 -0
  190. data/lib/embulk/input_plugin.rb +53 -0
  191. data/lib/embulk/java/bootstrap.rb +12 -0
  192. data/lib/embulk/java/imports.rb +26 -0
  193. data/lib/embulk/java/time_helper.rb +77 -0
  194. data/lib/embulk/output_plugin.rb +104 -0
  195. data/lib/embulk/page.rb +28 -0
  196. data/lib/embulk/page_builder.rb +22 -0
  197. data/lib/embulk/plugin.rb +152 -0
  198. data/lib/embulk/plugin_registry.rb +70 -0
  199. data/lib/embulk/schema.rb +85 -0
  200. data/lib/embulk/time_format_guess.rb +331 -0
  201. data/lib/embulk/version.rb +3 -0
  202. data/pom.xml +533 -0
  203. data/settings.gradle +5 -0
  204. metadata +370 -0
@@ -0,0 +1,16 @@
1
+ package org.embulk.standards;
2
+
3
+ import java.util.List;
4
+ import com.google.common.collect.ImmutableList;
5
+ import com.google.inject.Module;
6
+ import org.embulk.spi.Extension;
7
+ import org.embulk.config.ConfigSource;
8
+
9
+ public class StandardPluginExtension
10
+ implements Extension
11
+ {
12
+ public List<Module> getModules(ConfigSource systemConfig)
13
+ {
14
+ return ImmutableList.<Module>of(new StandardPluginModule());
15
+ }
16
+ }
@@ -0,0 +1,44 @@
1
+ package org.embulk.standards;
2
+
3
+ import com.google.common.base.Preconditions;
4
+ import com.google.inject.Binder;
5
+ import com.google.inject.Module;
6
+ import com.google.inject.name.Names;
7
+ import org.embulk.spi.FormatterPlugin;
8
+ import org.embulk.spi.InputPlugin;
9
+ import org.embulk.spi.OutputPlugin;
10
+ import org.embulk.spi.ParserPlugin;
11
+ import org.embulk.spi.DecoderPlugin;
12
+ import org.embulk.spi.EncoderPlugin;
13
+ import static org.embulk.plugin.InjectedPluginSource.registerPluginTo;
14
+
15
+ public class StandardPluginModule
16
+ implements Module
17
+ {
18
+ @Override
19
+ public void configure(Binder binder)
20
+ {
21
+ Preconditions.checkNotNull(binder, "binder is null.");
22
+
23
+ // input plugins
24
+ registerPluginTo(binder, InputPlugin.class, "file", LocalFileInputPlugin.class);
25
+ registerPluginTo(binder, InputPlugin.class, "s3_file", S3FileInputPlugin.class);
26
+
27
+ // parser plugins
28
+ registerPluginTo(binder, ParserPlugin.class, "csv", CsvParserPlugin.class);
29
+
30
+ // file decoder plugins
31
+ registerPluginTo(binder, DecoderPlugin.class, "gzip", GzipFileDecoderPlugin.class);
32
+
33
+ // output plugins
34
+ registerPluginTo(binder, OutputPlugin.class, "file", LocalFileOutputPlugin.class);
35
+ registerPluginTo(binder, OutputPlugin.class, "null", NullOutputPlugin.class);
36
+ registerPluginTo(binder, OutputPlugin.class, "stdout", StdoutOutputPlugin.class);
37
+
38
+ // formatter plugins
39
+ registerPluginTo(binder, FormatterPlugin.class, "csv", CsvFormatterPlugin.class);
40
+
41
+ // file encoder plugins
42
+ registerPluginTo(binder, EncoderPlugin.class, "gzip", GzipFileEncoderPlugin.class);
43
+ }
44
+ }
@@ -0,0 +1,71 @@
1
+ package org.embulk.standards;
2
+
3
+ import org.embulk.config.ConfigSource;
4
+ import org.embulk.config.TaskSource;
5
+ import org.embulk.config.NextConfig;
6
+ import org.embulk.config.CommitReport;
7
+ import org.embulk.config.Task;
8
+ import org.embulk.spi.time.TimestampFormatter;
9
+ import org.embulk.spi.Schema;
10
+ import org.embulk.spi.SchemaVisitor;
11
+ import org.embulk.spi.Column;
12
+ import org.embulk.spi.Page;
13
+ import org.embulk.spi.Exec;
14
+ import org.embulk.spi.OutputPlugin;
15
+ import org.embulk.spi.TransactionalPageOutput;
16
+ import org.embulk.spi.PageReader;
17
+ import org.embulk.spi.util.PagePrinter;
18
+
19
+ public class StdoutOutputPlugin
20
+ implements OutputPlugin
21
+ {
22
+ public interface PluginTask
23
+ extends Task, TimestampFormatter.FormatterTask
24
+ {
25
+ }
26
+
27
+ @Override
28
+ public NextConfig transaction(ConfigSource config,
29
+ Schema schema, int processorCount,
30
+ OutputPlugin.Control control)
31
+ {
32
+ final PluginTask task = config.loadConfig(PluginTask.class);
33
+ control.run(task.dump());
34
+ return Exec.newNextConfig();
35
+ }
36
+
37
+ @Override
38
+ public TransactionalPageOutput open(TaskSource taskSource, final Schema schema,
39
+ int processorIndex)
40
+ {
41
+ final PluginTask task = taskSource.loadTask(PluginTask.class);
42
+
43
+ return new TransactionalPageOutput() {
44
+ private final PageReader reader = new PageReader(schema);
45
+ private final PagePrinter printer = new PagePrinter(schema, task);
46
+
47
+ public void add(Page page)
48
+ {
49
+ reader.setPage(page);
50
+ while (reader.nextRecord()) {
51
+ System.out.println(printer.printRecord(reader, ","));
52
+ }
53
+ page.release();
54
+ }
55
+
56
+ public void finish()
57
+ {
58
+ System.out.flush();
59
+ }
60
+
61
+ public void close() { }
62
+
63
+ public void abort() { }
64
+
65
+ public CommitReport commit()
66
+ {
67
+ return Exec.newCommitReport();
68
+ }
69
+ };
70
+ }
71
+ }
@@ -0,0 +1 @@
1
+ org.embulk.standards.StandardPluginExtension
@@ -0,0 +1,69 @@
1
+ package org.embulk.standards;
2
+
3
+ import org.junit.Rule;
4
+ import org.junit.Before;
5
+ import org.junit.Test;
6
+ import static org.junit.Assert.assertEquals;
7
+ import java.nio.charset.Charset;
8
+ import com.google.common.collect.ImmutableList;
9
+ import com.google.common.collect.ImmutableMap;
10
+ import org.embulk.EmbulkTestRuntime;
11
+ import org.embulk.config.ConfigException;
12
+ import org.embulk.config.ConfigSource;
13
+ import org.embulk.spi.Exec;
14
+ import org.embulk.spi.util.Newline;
15
+
16
+ public class TestCsvParserPlugin
17
+ {
18
+ @Rule
19
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
20
+
21
+ @Test
22
+ public void checkDefaultValues()
23
+ {
24
+ ConfigSource config = Exec.newConfigSource()
25
+ .set("columns", ImmutableList.of(
26
+ ImmutableMap.of(
27
+ "name", "date_code",
28
+ "type", "string"))
29
+ );
30
+
31
+ CsvParserPlugin.PluginTask task = config.loadConfig(CsvParserPlugin.PluginTask.class);
32
+ assertEquals(Charset.forName("utf-8"), task.getCharset());
33
+ assertEquals(Newline.CRLF, task.getNewline());
34
+ assertEquals(false, task.getHeaderLine());
35
+ assertEquals(',', task.getDelimiterChar());
36
+ assertEquals('\"', task.getQuoteChar());
37
+ }
38
+
39
+ @Test(expected = ConfigException.class)
40
+ public void checkColumnsRequired()
41
+ {
42
+ ConfigSource config = Exec.newConfigSource();
43
+
44
+ config.loadConfig(CsvParserPlugin.PluginTask.class);
45
+ }
46
+
47
+ @Test
48
+ public void checkLoadConfig()
49
+ {
50
+ ConfigSource config = Exec.newConfigSource()
51
+ .set("charset", "utf-16")
52
+ .set("newline", "LF")
53
+ .set("header_line", true)
54
+ .set("delimiter", "\t")
55
+ .set("quote", "\\")
56
+ .set("columns", ImmutableList.of(
57
+ ImmutableMap.of(
58
+ "name", "date_code",
59
+ "type", "string"))
60
+ );
61
+
62
+ CsvParserPlugin.PluginTask task = config.loadConfig(CsvParserPlugin.PluginTask.class);
63
+ assertEquals(Charset.forName("utf-16"), task.getCharset());
64
+ assertEquals(Newline.LF, task.getNewline());
65
+ assertEquals(true, task.getHeaderLine());
66
+ assertEquals('\t', task.getDelimiterChar());
67
+ assertEquals('\\', task.getQuoteChar());
68
+ }
69
+ }
@@ -0,0 +1,291 @@
1
+ package org.embulk.standards;
2
+
3
+ import java.nio.ByteBuffer;
4
+ import java.nio.charset.Charset;
5
+ import java.nio.charset.UnsupportedCharsetException;
6
+ import java.util.ArrayList;
7
+ import java.util.Arrays;
8
+ import java.util.List;
9
+ import com.fasterxml.jackson.databind.node.JsonNodeFactory;
10
+ import com.google.common.collect.ImmutableList;
11
+ import com.google.common.collect.ImmutableMap;
12
+ import org.junit.Before;
13
+ import org.junit.Rule;
14
+ import org.junit.Test;
15
+ import static org.junit.Assert.assertEquals;
16
+ import org.embulk.EmbulkTestRuntime;
17
+ import org.embulk.config.ConfigSource;
18
+ import org.embulk.spi.Buffer;
19
+ import org.embulk.spi.FileInput;
20
+ import org.embulk.spi.Column;
21
+ import org.embulk.spi.Schema;
22
+ import org.embulk.spi.Exec;
23
+ import org.embulk.spi.util.LineDecoder;
24
+ import org.embulk.spi.util.ListFileInput;
25
+
26
+ public class TestCsvTokenizer
27
+ {
28
+ @Rule
29
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
30
+
31
+ protected ConfigSource config;
32
+ protected CsvParserPlugin.PluginTask task;
33
+
34
+ @Before
35
+ public void setup() {
36
+ config = Exec.newConfigSource()
37
+ .set("newline", "LF")
38
+ .set("columns", ImmutableList.of(
39
+ ImmutableMap.of(
40
+ "name", "date_code",
41
+ "type", "string"),
42
+ ImmutableMap.of(
43
+ "name", "foo",
44
+ "type", "string"))
45
+ );
46
+ reloadPluginTask();
47
+ }
48
+
49
+ private void reloadPluginTask()
50
+ {
51
+ task = config.loadConfig(CsvParserPlugin.PluginTask.class);
52
+ }
53
+
54
+ private static FileInput newFileInputFromLines(CsvParserPlugin.PluginTask task, String... lines)
55
+ {
56
+ List<Buffer> buffers = new ArrayList<>();
57
+ for (String line : lines) {
58
+ byte[] buffer = (line + task.getNewline().getString()).getBytes(task.getCharset());
59
+ buffers.add(Buffer.wrap(buffer));
60
+ }
61
+ return new ListFileInput(ImmutableList.of(buffers));
62
+ }
63
+
64
+ private static FileInput newFileInputFromText(CsvParserPlugin.PluginTask task, String text)
65
+ {
66
+ return new ListFileInput(
67
+ ImmutableList.of(ImmutableList.of(
68
+ Buffer.wrap(text.getBytes(task.getCharset())))));
69
+ }
70
+
71
+ private static List<List<String>> parse(CsvParserPlugin.PluginTask task, String... lines)
72
+ {
73
+ return parse(task, newFileInputFromLines(task, lines));
74
+ }
75
+
76
+ private static List<List<String>> parse(CsvParserPlugin.PluginTask task, FileInput input)
77
+ {
78
+ LineDecoder decoder = new LineDecoder(input, task);
79
+ CsvTokenizer tokenizer = new CsvTokenizer(decoder, task);
80
+ Schema schema = task.getSchemaConfig().toSchema();
81
+
82
+ tokenizer.nextFile();
83
+
84
+ List<List<String>> records = new ArrayList<>();
85
+ while (tokenizer.nextRecord()) {
86
+ List<String> record = new ArrayList<>();
87
+ for (Column c : schema.getColumns()) {
88
+ String v = tokenizer.nextColumn();
89
+ if (!v.isEmpty()) {
90
+ record.add(v);
91
+ } else {
92
+ record.add(tokenizer.wasQuotedColumn() ? "" : null);
93
+ }
94
+ }
95
+ records.add(record);
96
+ }
97
+ return records;
98
+ }
99
+
100
+ private List<List<String>> expectedRecords(int columnCount, String... values)
101
+ {
102
+ List<List<String>> records = new ArrayList<>();
103
+ List<String> columns = null;
104
+ for (int i=0; i < values.length; i++) {
105
+ if (i % columnCount == 0) {
106
+ columns = new ArrayList<String>();
107
+ records.add(columns);
108
+ }
109
+ columns.add(values[i]);
110
+ }
111
+ return records;
112
+ }
113
+
114
+ @Test
115
+ public void testSimple() throws Exception
116
+ {
117
+ assertEquals(expectedRecords(2,
118
+ "aaa", "bbb",
119
+ "ccc", "ddd"),
120
+ parse(task,
121
+ "aaa,bbb",
122
+ "ccc,ddd"));
123
+ }
124
+
125
+ @Test
126
+ public void testSkipEmptyLine() throws Exception
127
+ {
128
+ assertEquals(expectedRecords(2,
129
+ "aaa", "bbb",
130
+ "ccc", "ddd"),
131
+ parse(task,
132
+ "", "aaa,bbb", "", "",
133
+ "ccc,ddd", "", ""));
134
+ }
135
+
136
+ @Test
137
+ public void parseEmptyColumnsToNull() throws Exception
138
+ {
139
+ assertEquals(expectedRecords(2,
140
+ null, null,
141
+ "", "",
142
+ " ", " "), // not trimmed
143
+ parse(task,
144
+ ",",
145
+ "\"\",\"\"",
146
+ " , "));
147
+ }
148
+
149
+ @Test
150
+ public void parseEmptyColumnsToNullTrimmed() throws Exception
151
+ {
152
+ config.set("trim_if_not_quoted", true);
153
+ reloadPluginTask();
154
+ assertEquals(
155
+ expectedRecords(2,
156
+ null, null,
157
+ "", "",
158
+ null, null), // trimmed
159
+ parse(task,
160
+ ",",
161
+ "\"\",\"\"",
162
+ " , "));
163
+ }
164
+
165
+ @Test
166
+ public void testMultilineQuotedValueWithEmptyLine() throws Exception
167
+ {
168
+ assertEquals(expectedRecords(2,
169
+ "a", "\nb\n\n",
170
+ "c", "d"),
171
+ parse(task,
172
+ "",
173
+ "a,\"", "b", "", "\"",
174
+ "c,d"));
175
+ }
176
+
177
+ @Test
178
+ public void testEndOfFileWithoutNewline() throws Exception
179
+ {
180
+ // In RFC 4180, the last record in the file may or may not have
181
+ // an ending line break.
182
+ assertEquals(expectedRecords(2,
183
+ "aaa", "bbb",
184
+ "ccc", "ddd"),
185
+ parse(task, newFileInputFromText(task,
186
+ "aaa,bbb\nccc,ddd")));
187
+ }
188
+
189
+ @Test
190
+ public void testChangeDelimiter() throws Exception
191
+ {
192
+ config.set("delimiter", JsonNodeFactory.instance.textNode("\t")); // TSV format
193
+ reloadPluginTask();
194
+ assertEquals(expectedRecords(2,
195
+ "aaa", "bbb",
196
+ "ccc", "ddd"),
197
+ parse(task,
198
+ "aaa\tbbb",
199
+ "ccc\tddd"));
200
+ }
201
+
202
+ @Test
203
+ public void testQuotedValues() throws Exception
204
+ {
205
+ assertEquals(expectedRecords(2,
206
+ "a\na\na", "b,bb",
207
+ "cc\"c", "\"ddd",
208
+ null, ""),
209
+ parse(task, newFileInputFromText(task,
210
+ "\n\"a\na\na\",\"b,bb\"\n\n\"cc\"\"c\",\"\"\"ddd\"\n,\"\"\n")));
211
+ }
212
+
213
+ @Test
214
+ public void parseEscapedValues() throws Exception
215
+ {
216
+ assertEquals(expectedRecords(2,
217
+ "a\"aa", "b,bb\"",
218
+ "cc\"c", "\"ddd",
219
+ null, ""),
220
+ parse(task, newFileInputFromText(task,
221
+ "\n\"a\\\"aa\",\"b,bb\\\"\"\n\n\"cc\"\"c\",\"\"\"ddd\"\n,\"\"\n")));
222
+ }
223
+
224
+ @Test
225
+ public void trimNonQuotedValues() throws Exception
226
+ {
227
+ assertEquals(expectedRecords(2,
228
+ " aaa ", " b cd ",
229
+ " ccc","dd d \n "), // quoted values are not changed
230
+ parse(task, newFileInputFromText(task,
231
+ " aaa , b cd \n\" ccc\",\"dd d \n \"")));
232
+
233
+ // trim_if_not_quoted is true
234
+ config.set("trim_if_not_quoted", true);
235
+ reloadPluginTask();
236
+ assertEquals(expectedRecords(2,
237
+ "aaa", "b cd",
238
+ " ccc","dd d \n "), // quoted values are not changed
239
+ parse(task, newFileInputFromText(task,
240
+ " aaa , b cd \n\" ccc\",\"dd d \n \"")));
241
+ }
242
+
243
+ @Test
244
+ public void parseQuotedValueWithSpacesAndTrimmingOption() throws Exception
245
+ {
246
+ config.set("trim_if_not_quoted", true);
247
+ reloadPluginTask();
248
+ assertEquals(expectedRecords(2,
249
+ "heading1", "heading2",
250
+ "trailing1","trailing2",
251
+ "trailing\n3","trailing\n4"),
252
+ parse(task,
253
+ " \"heading1\", \"heading2\"",
254
+ "\"trailing1\" ,\"trailing2\" ",
255
+ "\"trailing\n3\" ,\"trailing\n4\" "));
256
+ }
257
+
258
+ /*
259
+ @Test(expected = CsvTokenizer.CsvValueValidateException.class)
260
+ public void parseTooLargeSizedValues() throws Exception
261
+ {
262
+ config.set("max_quoted_column_size", 8L);
263
+ reloadPluginTask();
264
+ List<List<String>> parsed = doParse(task, bufferList("utf-8",
265
+ "aaa,bbb", "\n", "\"cccccccc\",ddd", "\n"));
266
+
267
+ assertEquals(Arrays.asList(
268
+ Arrays.asList("aaa", "bbb"),
269
+ Arrays.asList("ccc", "ddd")),
270
+ parsed);
271
+ }
272
+ */
273
+
274
+ /*
275
+ @Test
276
+ public void parseEscapedQuotedValues() throws Exception
277
+ {
278
+ List<List<String>> parsed = doParse(task, bufferList("utf-8",
279
+ "\"aa,a\",\",aaa\",\"aaa,\"", "\n",
280
+ "\"bb\"\"b\",\"\"\"bbb\",\"bbb\"\"\"", "\n",
281
+ "\"cc\\\"c\",\"\\\"ccc\",\"ccc\\\"\"", "\n",
282
+ "\"dd\nd\",\"\nddd\",\"ddd\n\"", "\n"));
283
+ assertEquals(Arrays.asList(
284
+ Arrays.asList("aa,a", ",aaa", "aaa,"),
285
+ Arrays.asList("bb\"b", "\"bbb", "bbb\""),
286
+ Arrays.asList("cc\"c", "\"ccc", "ccc\""),
287
+ Arrays.asList("dd\nd", "\nddd", "ddd\n")),
288
+ parsed);
289
+ }
290
+ */
291
+ }