embulk 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. checksums.yaml +15 -0
  2. data/.gitignore +13 -0
  3. data/Gemfile +3 -0
  4. data/Gemfile.lock +33 -0
  5. data/README.md +117 -0
  6. data/Rakefile +58 -0
  7. data/bin/embulk +63 -0
  8. data/build.gradle +149 -0
  9. data/embulk-cli/build.gradle +6 -0
  10. data/embulk-cli/pom.xml +94 -0
  11. data/embulk-cli/src/main/java/org/embulk/cli/Main.java +15 -0
  12. data/embulk-core/build.gradle +6 -0
  13. data/embulk-core/pom.xml +143 -0
  14. data/embulk-core/src/main/java/org/embulk/EmbulkService.java +39 -0
  15. data/embulk-core/src/main/java/org/embulk/command/Runner.java +199 -0
  16. data/embulk-core/src/main/java/org/embulk/command/TablePrinter.java +119 -0
  17. data/embulk-core/src/main/java/org/embulk/config/CommitReport.java +26 -0
  18. data/embulk-core/src/main/java/org/embulk/config/Config.java +15 -0
  19. data/embulk-core/src/main/java/org/embulk/config/ConfigDefault.java +15 -0
  20. data/embulk-core/src/main/java/org/embulk/config/ConfigException.java +20 -0
  21. data/embulk-core/src/main/java/org/embulk/config/ConfigLoader.java +83 -0
  22. data/embulk-core/src/main/java/org/embulk/config/ConfigSource.java +28 -0
  23. data/embulk-core/src/main/java/org/embulk/config/DataSource.java +35 -0
  24. data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +208 -0
  25. data/embulk-core/src/main/java/org/embulk/config/DataSourceSerDe.java +80 -0
  26. data/embulk-core/src/main/java/org/embulk/config/GenericTypeReference.java +20 -0
  27. data/embulk-core/src/main/java/org/embulk/config/ModelManager.java +125 -0
  28. data/embulk-core/src/main/java/org/embulk/config/NextConfig.java +26 -0
  29. data/embulk-core/src/main/java/org/embulk/config/Task.java +10 -0
  30. data/embulk-core/src/main/java/org/embulk/config/TaskInvocationHandler.java +180 -0
  31. data/embulk-core/src/main/java/org/embulk/config/TaskSerDe.java +343 -0
  32. data/embulk-core/src/main/java/org/embulk/config/TaskSource.java +28 -0
  33. data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +37 -0
  34. data/embulk-core/src/main/java/org/embulk/config/TaskValidator.java +24 -0
  35. data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +45 -0
  36. data/embulk-core/src/main/java/org/embulk/exec/ExecuteInterruptedException.java +10 -0
  37. data/embulk-core/src/main/java/org/embulk/exec/ExecuteResult.java +19 -0
  38. data/embulk-core/src/main/java/org/embulk/exec/ExtensionServiceLoaderModule.java +43 -0
  39. data/embulk-core/src/main/java/org/embulk/exec/ForSystemConfig.java +16 -0
  40. data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +307 -0
  41. data/embulk-core/src/main/java/org/embulk/exec/LocalExecutor.java +274 -0
  42. data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +30 -0
  43. data/embulk-core/src/main/java/org/embulk/exec/NoSampleException.java +10 -0
  44. data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +58 -0
  45. data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +138 -0
  46. data/embulk-core/src/main/java/org/embulk/exec/PreviewResult.java +27 -0
  47. data/embulk-core/src/main/java/org/embulk/exec/PreviewedNoticeError.java +17 -0
  48. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +116 -0
  49. data/embulk-core/src/main/java/org/embulk/exec/SystemConfigModule.java +24 -0
  50. data/embulk-core/src/main/java/org/embulk/jruby/JRubyPluginSource.java +69 -0
  51. data/embulk-core/src/main/java/org/embulk/jruby/JRubyScriptingModule.java +100 -0
  52. data/embulk-core/src/main/java/org/embulk/plugin/BuiltinPluginSourceModule.java +17 -0
  53. data/embulk-core/src/main/java/org/embulk/plugin/InjectedPluginSource.java +92 -0
  54. data/embulk-core/src/main/java/org/embulk/plugin/PluginManager.java +34 -0
  55. data/embulk-core/src/main/java/org/embulk/plugin/PluginSource.java +6 -0
  56. data/embulk-core/src/main/java/org/embulk/plugin/PluginSourceNotMatchException.java +19 -0
  57. data/embulk-core/src/main/java/org/embulk/plugin/PluginType.java +47 -0
  58. data/embulk-core/src/main/java/org/embulk/plugin/SetThreadContextClassLoader.java +19 -0
  59. data/embulk-core/src/main/java/org/embulk/spi/Buffer.java +113 -0
  60. data/embulk-core/src/main/java/org/embulk/spi/BufferAllocator.java +8 -0
  61. data/embulk-core/src/main/java/org/embulk/spi/Column.java +92 -0
  62. data/embulk-core/src/main/java/org/embulk/spi/ColumnConfig.java +79 -0
  63. data/embulk-core/src/main/java/org/embulk/spi/DecoderPlugin.java +16 -0
  64. data/embulk-core/src/main/java/org/embulk/spi/EncoderPlugin.java +16 -0
  65. data/embulk-core/src/main/java/org/embulk/spi/Exec.java +76 -0
  66. data/embulk-core/src/main/java/org/embulk/spi/ExecAction.java +6 -0
  67. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +105 -0
  68. data/embulk-core/src/main/java/org/embulk/spi/Extension.java +42 -0
  69. data/embulk-core/src/main/java/org/embulk/spi/FileInput.java +11 -0
  70. data/embulk-core/src/main/java/org/embulk/spi/FileInputPlugin.java +19 -0
  71. data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +113 -0
  72. data/embulk-core/src/main/java/org/embulk/spi/FileOutput.java +13 -0
  73. data/embulk-core/src/main/java/org/embulk/spi/FileOutputPlugin.java +20 -0
  74. data/embulk-core/src/main/java/org/embulk/spi/FileOutputRunner.java +167 -0
  75. data/embulk-core/src/main/java/org/embulk/spi/FormatterPlugin.java +18 -0
  76. data/embulk-core/src/main/java/org/embulk/spi/GuessPlugin.java +9 -0
  77. data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +20 -0
  78. data/embulk-core/src/main/java/org/embulk/spi/OutputPlugin.java +21 -0
  79. data/embulk-core/src/main/java/org/embulk/spi/Page.java +45 -0
  80. data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +327 -0
  81. data/embulk-core/src/main/java/org/embulk/spi/PageFormat.java +47 -0
  82. data/embulk-core/src/main/java/org/embulk/spi/PageOutput.java +11 -0
  83. data/embulk-core/src/main/java/org/embulk/spi/PageReader.java +227 -0
  84. data/embulk-core/src/main/java/org/embulk/spi/ParserPlugin.java +17 -0
  85. data/embulk-core/src/main/java/org/embulk/spi/Schema.java +101 -0
  86. data/embulk-core/src/main/java/org/embulk/spi/SchemaConfig.java +52 -0
  87. data/embulk-core/src/main/java/org/embulk/spi/SchemaVisitor.java +14 -0
  88. data/embulk-core/src/main/java/org/embulk/spi/Transactional.java +10 -0
  89. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileInput.java +17 -0
  90. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileOutput.java +19 -0
  91. data/embulk-core/src/main/java/org/embulk/spi/TransactionalPageOutput.java +17 -0
  92. data/embulk-core/src/main/java/org/embulk/spi/time/DateTimeZoneSerDe.java +57 -0
  93. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelper.java +8 -0
  94. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelperFactory.java +6 -0
  95. data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +159 -0
  96. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +98 -0
  97. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormatter.java +55 -0
  98. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParseException.java +6 -0
  99. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParser.java +60 -0
  100. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampSerDe.java +50 -0
  101. data/embulk-core/src/main/java/org/embulk/spi/type/AbstractType.java +55 -0
  102. data/embulk-core/src/main/java/org/embulk/spi/type/BooleanType.java +12 -0
  103. data/embulk-core/src/main/java/org/embulk/spi/type/DoubleType.java +12 -0
  104. data/embulk-core/src/main/java/org/embulk/spi/type/LongType.java +12 -0
  105. data/embulk-core/src/main/java/org/embulk/spi/type/StringType.java +12 -0
  106. data/embulk-core/src/main/java/org/embulk/spi/type/TimestampType.java +39 -0
  107. data/embulk-core/src/main/java/org/embulk/spi/type/Type.java +15 -0
  108. data/embulk-core/src/main/java/org/embulk/spi/type/TypeDeserializer.java +47 -0
  109. data/embulk-core/src/main/java/org/embulk/spi/type/Types.java +14 -0
  110. data/embulk-core/src/main/java/org/embulk/spi/util/CharsetSerDe.java +55 -0
  111. data/embulk-core/src/main/java/org/embulk/spi/util/Decoders.java +81 -0
  112. data/embulk-core/src/main/java/org/embulk/spi/util/Encoders.java +81 -0
  113. data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +110 -0
  114. data/embulk-core/src/main/java/org/embulk/spi/util/FileOutputOutputStream.java +94 -0
  115. data/embulk-core/src/main/java/org/embulk/spi/util/InputStreamFileInput.java +111 -0
  116. data/embulk-core/src/main/java/org/embulk/spi/util/Inputs.java +74 -0
  117. data/embulk-core/src/main/java/org/embulk/spi/util/LineDecoder.java +118 -0
  118. data/embulk-core/src/main/java/org/embulk/spi/util/LineEncoder.java +109 -0
  119. data/embulk-core/src/main/java/org/embulk/spi/util/ListFileInput.java +52 -0
  120. data/embulk-core/src/main/java/org/embulk/spi/util/Newline.java +38 -0
  121. data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +102 -0
  122. data/embulk-core/src/main/java/org/embulk/spi/util/Pages.java +139 -0
  123. data/embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java +110 -0
  124. data/embulk-core/src/test/java/org/embulk/GuiceBinder.java +72 -0
  125. data/embulk-core/src/test/java/org/embulk/RandomManager.java +53 -0
  126. data/embulk-core/src/test/java/org/embulk/TestPluginSourceModule.java +23 -0
  127. data/embulk-core/src/test/java/org/embulk/TestUtilityModule.java +17 -0
  128. data/embulk-core/src/test/java/org/embulk/config/TestConfigSource.java +114 -0
  129. data/embulk-core/src/test/java/org/embulk/config/TestTaskSource.java +70 -0
  130. data/embulk-core/src/test/java/org/embulk/plugin/MockPluginSource.java +57 -0
  131. data/embulk-core/src/test/java/org/embulk/plugin/TestPluginType.java +18 -0
  132. data/embulk-core/src/test/java/org/embulk/spi/MockFileOutput.java +63 -0
  133. data/embulk-core/src/test/java/org/embulk/spi/MockFormatterPlugin.java +101 -0
  134. data/embulk-core/src/test/java/org/embulk/spi/MockParserPlugin.java +73 -0
  135. data/embulk-core/src/test/java/org/embulk/spi/PageTestUtils.java +78 -0
  136. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +67 -0
  137. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputRunner.java +180 -0
  138. data/embulk-core/src/test/java/org/embulk/spi/TestFileOutputRunner.java +192 -0
  139. data/embulk-core/src/test/java/org/embulk/spi/TestInputStreamFileInput.java +188 -0
  140. data/embulk-core/src/test/java/org/embulk/spi/TestPageBuilderReader.java +301 -0
  141. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestamp.java +116 -0
  142. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestampFormatterParser.java +52 -0
  143. data/embulk-core/src/test/java/org/embulk/spi/type/TestTypeSerDe.java +45 -0
  144. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineDecoder.java +132 -0
  145. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineEncoder.java +123 -0
  146. data/embulk-standards/build.gradle +6 -0
  147. data/embulk-standards/pom.xml +68 -0
  148. data/embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java +158 -0
  149. data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +233 -0
  150. data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +355 -0
  151. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java +55 -0
  152. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java +39 -0
  153. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +138 -0
  154. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java +128 -0
  155. data/embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java +46 -0
  156. data/embulk-standards/src/main/java/org/embulk/standards/S3FileInputPlugin.java +238 -0
  157. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginExtension.java +16 -0
  158. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +44 -0
  159. data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +71 -0
  160. data/embulk-standards/src/main/resources/META-INF/services/org.embulk.spi.Extension +1 -0
  161. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +69 -0
  162. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +291 -0
  163. data/embulk-standards/src/test/java/org/embulk/standards/TestS3FileInputPlugin.java +43 -0
  164. data/embulk.gemspec +27 -0
  165. data/examples/config.yml +34 -0
  166. data/examples/csv/sample.csv.gz +0 -0
  167. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  168. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  169. data/gradlew +164 -0
  170. data/gradlew.bat +90 -0
  171. data/lib/embulk.rb +16 -0
  172. data/lib/embulk/buffer.rb +17 -0
  173. data/lib/embulk/column.rb +47 -0
  174. data/lib/embulk/command/embulk.rb +39 -0
  175. data/lib/embulk/command/embulk_example.rb +32 -0
  176. data/lib/embulk/command/embulk_generate_bin.rb +62 -0
  177. data/lib/embulk/command/embulk_run.rb +243 -0
  178. data/lib/embulk/data/bundle/.bundle/config +3 -0
  179. data/lib/embulk/data/bundle/Gemfile +31 -0
  180. data/lib/embulk/data/bundle/Gemfile.lock +8 -0
  181. data/lib/embulk/data/bundle/embulk/input_example.rb +40 -0
  182. data/lib/embulk/data/bundle/embulk/output_example.rb +51 -0
  183. data/lib/embulk/data_source.rb +66 -0
  184. data/lib/embulk/error.rb +5 -0
  185. data/lib/embulk/guess_charset.rb +26 -0
  186. data/lib/embulk/guess_csv.rb +195 -0
  187. data/lib/embulk/guess_gzip.rb +18 -0
  188. data/lib/embulk/guess_newline.rb +20 -0
  189. data/lib/embulk/guess_plugin.rb +113 -0
  190. data/lib/embulk/input_plugin.rb +53 -0
  191. data/lib/embulk/java/bootstrap.rb +12 -0
  192. data/lib/embulk/java/imports.rb +26 -0
  193. data/lib/embulk/java/time_helper.rb +77 -0
  194. data/lib/embulk/output_plugin.rb +104 -0
  195. data/lib/embulk/page.rb +28 -0
  196. data/lib/embulk/page_builder.rb +22 -0
  197. data/lib/embulk/plugin.rb +152 -0
  198. data/lib/embulk/plugin_registry.rb +70 -0
  199. data/lib/embulk/schema.rb +85 -0
  200. data/lib/embulk/time_format_guess.rb +331 -0
  201. data/lib/embulk/version.rb +3 -0
  202. data/pom.xml +533 -0
  203. data/settings.gradle +5 -0
  204. metadata +370 -0
@@ -0,0 +1,16 @@
1
+ package org.embulk.standards;
2
+
3
+ import java.util.List;
4
+ import com.google.common.collect.ImmutableList;
5
+ import com.google.inject.Module;
6
+ import org.embulk.spi.Extension;
7
+ import org.embulk.config.ConfigSource;
8
+
9
+ public class StandardPluginExtension
10
+ implements Extension
11
+ {
12
+ public List<Module> getModules(ConfigSource systemConfig)
13
+ {
14
+ return ImmutableList.<Module>of(new StandardPluginModule());
15
+ }
16
+ }
@@ -0,0 +1,44 @@
1
+ package org.embulk.standards;
2
+
3
+ import com.google.common.base.Preconditions;
4
+ import com.google.inject.Binder;
5
+ import com.google.inject.Module;
6
+ import com.google.inject.name.Names;
7
+ import org.embulk.spi.FormatterPlugin;
8
+ import org.embulk.spi.InputPlugin;
9
+ import org.embulk.spi.OutputPlugin;
10
+ import org.embulk.spi.ParserPlugin;
11
+ import org.embulk.spi.DecoderPlugin;
12
+ import org.embulk.spi.EncoderPlugin;
13
+ import static org.embulk.plugin.InjectedPluginSource.registerPluginTo;
14
+
15
+ public class StandardPluginModule
16
+ implements Module
17
+ {
18
+ @Override
19
+ public void configure(Binder binder)
20
+ {
21
+ Preconditions.checkNotNull(binder, "binder is null.");
22
+
23
+ // input plugins
24
+ registerPluginTo(binder, InputPlugin.class, "file", LocalFileInputPlugin.class);
25
+ registerPluginTo(binder, InputPlugin.class, "s3_file", S3FileInputPlugin.class);
26
+
27
+ // parser plugins
28
+ registerPluginTo(binder, ParserPlugin.class, "csv", CsvParserPlugin.class);
29
+
30
+ // file decoder plugins
31
+ registerPluginTo(binder, DecoderPlugin.class, "gzip", GzipFileDecoderPlugin.class);
32
+
33
+ // output plugins
34
+ registerPluginTo(binder, OutputPlugin.class, "file", LocalFileOutputPlugin.class);
35
+ registerPluginTo(binder, OutputPlugin.class, "null", NullOutputPlugin.class);
36
+ registerPluginTo(binder, OutputPlugin.class, "stdout", StdoutOutputPlugin.class);
37
+
38
+ // formatter plugins
39
+ registerPluginTo(binder, FormatterPlugin.class, "csv", CsvFormatterPlugin.class);
40
+
41
+ // file encoder plugins
42
+ registerPluginTo(binder, EncoderPlugin.class, "gzip", GzipFileEncoderPlugin.class);
43
+ }
44
+ }
@@ -0,0 +1,71 @@
1
+ package org.embulk.standards;
2
+
3
+ import org.embulk.config.ConfigSource;
4
+ import org.embulk.config.TaskSource;
5
+ import org.embulk.config.NextConfig;
6
+ import org.embulk.config.CommitReport;
7
+ import org.embulk.config.Task;
8
+ import org.embulk.spi.time.TimestampFormatter;
9
+ import org.embulk.spi.Schema;
10
+ import org.embulk.spi.SchemaVisitor;
11
+ import org.embulk.spi.Column;
12
+ import org.embulk.spi.Page;
13
+ import org.embulk.spi.Exec;
14
+ import org.embulk.spi.OutputPlugin;
15
+ import org.embulk.spi.TransactionalPageOutput;
16
+ import org.embulk.spi.PageReader;
17
+ import org.embulk.spi.util.PagePrinter;
18
+
19
+ public class StdoutOutputPlugin
20
+ implements OutputPlugin
21
+ {
22
+ public interface PluginTask
23
+ extends Task, TimestampFormatter.FormatterTask
24
+ {
25
+ }
26
+
27
+ @Override
28
+ public NextConfig transaction(ConfigSource config,
29
+ Schema schema, int processorCount,
30
+ OutputPlugin.Control control)
31
+ {
32
+ final PluginTask task = config.loadConfig(PluginTask.class);
33
+ control.run(task.dump());
34
+ return Exec.newNextConfig();
35
+ }
36
+
37
+ @Override
38
+ public TransactionalPageOutput open(TaskSource taskSource, final Schema schema,
39
+ int processorIndex)
40
+ {
41
+ final PluginTask task = taskSource.loadTask(PluginTask.class);
42
+
43
+ return new TransactionalPageOutput() {
44
+ private final PageReader reader = new PageReader(schema);
45
+ private final PagePrinter printer = new PagePrinter(schema, task);
46
+
47
+ public void add(Page page)
48
+ {
49
+ reader.setPage(page);
50
+ while (reader.nextRecord()) {
51
+ System.out.println(printer.printRecord(reader, ","));
52
+ }
53
+ page.release();
54
+ }
55
+
56
+ public void finish()
57
+ {
58
+ System.out.flush();
59
+ }
60
+
61
+ public void close() { }
62
+
63
+ public void abort() { }
64
+
65
+ public CommitReport commit()
66
+ {
67
+ return Exec.newCommitReport();
68
+ }
69
+ };
70
+ }
71
+ }
@@ -0,0 +1 @@
1
+ org.embulk.standards.StandardPluginExtension
@@ -0,0 +1,69 @@
1
+ package org.embulk.standards;
2
+
3
+ import org.junit.Rule;
4
+ import org.junit.Before;
5
+ import org.junit.Test;
6
+ import static org.junit.Assert.assertEquals;
7
+ import java.nio.charset.Charset;
8
+ import com.google.common.collect.ImmutableList;
9
+ import com.google.common.collect.ImmutableMap;
10
+ import org.embulk.EmbulkTestRuntime;
11
+ import org.embulk.config.ConfigException;
12
+ import org.embulk.config.ConfigSource;
13
+ import org.embulk.spi.Exec;
14
+ import org.embulk.spi.util.Newline;
15
+
16
+ public class TestCsvParserPlugin
17
+ {
18
+ @Rule
19
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
20
+
21
+ @Test
22
+ public void checkDefaultValues()
23
+ {
24
+ ConfigSource config = Exec.newConfigSource()
25
+ .set("columns", ImmutableList.of(
26
+ ImmutableMap.of(
27
+ "name", "date_code",
28
+ "type", "string"))
29
+ );
30
+
31
+ CsvParserPlugin.PluginTask task = config.loadConfig(CsvParserPlugin.PluginTask.class);
32
+ assertEquals(Charset.forName("utf-8"), task.getCharset());
33
+ assertEquals(Newline.CRLF, task.getNewline());
34
+ assertEquals(false, task.getHeaderLine());
35
+ assertEquals(',', task.getDelimiterChar());
36
+ assertEquals('\"', task.getQuoteChar());
37
+ }
38
+
39
+ @Test(expected = ConfigException.class)
40
+ public void checkColumnsRequired()
41
+ {
42
+ ConfigSource config = Exec.newConfigSource();
43
+
44
+ config.loadConfig(CsvParserPlugin.PluginTask.class);
45
+ }
46
+
47
+ @Test
48
+ public void checkLoadConfig()
49
+ {
50
+ ConfigSource config = Exec.newConfigSource()
51
+ .set("charset", "utf-16")
52
+ .set("newline", "LF")
53
+ .set("header_line", true)
54
+ .set("delimiter", "\t")
55
+ .set("quote", "\\")
56
+ .set("columns", ImmutableList.of(
57
+ ImmutableMap.of(
58
+ "name", "date_code",
59
+ "type", "string"))
60
+ );
61
+
62
+ CsvParserPlugin.PluginTask task = config.loadConfig(CsvParserPlugin.PluginTask.class);
63
+ assertEquals(Charset.forName("utf-16"), task.getCharset());
64
+ assertEquals(Newline.LF, task.getNewline());
65
+ assertEquals(true, task.getHeaderLine());
66
+ assertEquals('\t', task.getDelimiterChar());
67
+ assertEquals('\\', task.getQuoteChar());
68
+ }
69
+ }
@@ -0,0 +1,291 @@
1
+ package org.embulk.standards;
2
+
3
+ import java.nio.ByteBuffer;
4
+ import java.nio.charset.Charset;
5
+ import java.nio.charset.UnsupportedCharsetException;
6
+ import java.util.ArrayList;
7
+ import java.util.Arrays;
8
+ import java.util.List;
9
+ import com.fasterxml.jackson.databind.node.JsonNodeFactory;
10
+ import com.google.common.collect.ImmutableList;
11
+ import com.google.common.collect.ImmutableMap;
12
+ import org.junit.Before;
13
+ import org.junit.Rule;
14
+ import org.junit.Test;
15
+ import static org.junit.Assert.assertEquals;
16
+ import org.embulk.EmbulkTestRuntime;
17
+ import org.embulk.config.ConfigSource;
18
+ import org.embulk.spi.Buffer;
19
+ import org.embulk.spi.FileInput;
20
+ import org.embulk.spi.Column;
21
+ import org.embulk.spi.Schema;
22
+ import org.embulk.spi.Exec;
23
+ import org.embulk.spi.util.LineDecoder;
24
+ import org.embulk.spi.util.ListFileInput;
25
+
26
+ public class TestCsvTokenizer
27
+ {
28
+ @Rule
29
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
30
+
31
+ protected ConfigSource config;
32
+ protected CsvParserPlugin.PluginTask task;
33
+
34
+ @Before
35
+ public void setup() {
36
+ config = Exec.newConfigSource()
37
+ .set("newline", "LF")
38
+ .set("columns", ImmutableList.of(
39
+ ImmutableMap.of(
40
+ "name", "date_code",
41
+ "type", "string"),
42
+ ImmutableMap.of(
43
+ "name", "foo",
44
+ "type", "string"))
45
+ );
46
+ reloadPluginTask();
47
+ }
48
+
49
+ private void reloadPluginTask()
50
+ {
51
+ task = config.loadConfig(CsvParserPlugin.PluginTask.class);
52
+ }
53
+
54
+ private static FileInput newFileInputFromLines(CsvParserPlugin.PluginTask task, String... lines)
55
+ {
56
+ List<Buffer> buffers = new ArrayList<>();
57
+ for (String line : lines) {
58
+ byte[] buffer = (line + task.getNewline().getString()).getBytes(task.getCharset());
59
+ buffers.add(Buffer.wrap(buffer));
60
+ }
61
+ return new ListFileInput(ImmutableList.of(buffers));
62
+ }
63
+
64
+ private static FileInput newFileInputFromText(CsvParserPlugin.PluginTask task, String text)
65
+ {
66
+ return new ListFileInput(
67
+ ImmutableList.of(ImmutableList.of(
68
+ Buffer.wrap(text.getBytes(task.getCharset())))));
69
+ }
70
+
71
+ private static List<List<String>> parse(CsvParserPlugin.PluginTask task, String... lines)
72
+ {
73
+ return parse(task, newFileInputFromLines(task, lines));
74
+ }
75
+
76
+ private static List<List<String>> parse(CsvParserPlugin.PluginTask task, FileInput input)
77
+ {
78
+ LineDecoder decoder = new LineDecoder(input, task);
79
+ CsvTokenizer tokenizer = new CsvTokenizer(decoder, task);
80
+ Schema schema = task.getSchemaConfig().toSchema();
81
+
82
+ tokenizer.nextFile();
83
+
84
+ List<List<String>> records = new ArrayList<>();
85
+ while (tokenizer.nextRecord()) {
86
+ List<String> record = new ArrayList<>();
87
+ for (Column c : schema.getColumns()) {
88
+ String v = tokenizer.nextColumn();
89
+ if (!v.isEmpty()) {
90
+ record.add(v);
91
+ } else {
92
+ record.add(tokenizer.wasQuotedColumn() ? "" : null);
93
+ }
94
+ }
95
+ records.add(record);
96
+ }
97
+ return records;
98
+ }
99
+
100
+ private List<List<String>> expectedRecords(int columnCount, String... values)
101
+ {
102
+ List<List<String>> records = new ArrayList<>();
103
+ List<String> columns = null;
104
+ for (int i=0; i < values.length; i++) {
105
+ if (i % columnCount == 0) {
106
+ columns = new ArrayList<String>();
107
+ records.add(columns);
108
+ }
109
+ columns.add(values[i]);
110
+ }
111
+ return records;
112
+ }
113
+
114
+ @Test
115
+ public void testSimple() throws Exception
116
+ {
117
+ assertEquals(expectedRecords(2,
118
+ "aaa", "bbb",
119
+ "ccc", "ddd"),
120
+ parse(task,
121
+ "aaa,bbb",
122
+ "ccc,ddd"));
123
+ }
124
+
125
+ @Test
126
+ public void testSkipEmptyLine() throws Exception
127
+ {
128
+ assertEquals(expectedRecords(2,
129
+ "aaa", "bbb",
130
+ "ccc", "ddd"),
131
+ parse(task,
132
+ "", "aaa,bbb", "", "",
133
+ "ccc,ddd", "", ""));
134
+ }
135
+
136
+ @Test
137
+ public void parseEmptyColumnsToNull() throws Exception
138
+ {
139
+ assertEquals(expectedRecords(2,
140
+ null, null,
141
+ "", "",
142
+ " ", " "), // not trimmed
143
+ parse(task,
144
+ ",",
145
+ "\"\",\"\"",
146
+ " , "));
147
+ }
148
+
149
+ @Test
150
+ public void parseEmptyColumnsToNullTrimmed() throws Exception
151
+ {
152
+ config.set("trim_if_not_quoted", true);
153
+ reloadPluginTask();
154
+ assertEquals(
155
+ expectedRecords(2,
156
+ null, null,
157
+ "", "",
158
+ null, null), // trimmed
159
+ parse(task,
160
+ ",",
161
+ "\"\",\"\"",
162
+ " , "));
163
+ }
164
+
165
+ @Test
166
+ public void testMultilineQuotedValueWithEmptyLine() throws Exception
167
+ {
168
+ assertEquals(expectedRecords(2,
169
+ "a", "\nb\n\n",
170
+ "c", "d"),
171
+ parse(task,
172
+ "",
173
+ "a,\"", "b", "", "\"",
174
+ "c,d"));
175
+ }
176
+
177
+ @Test
178
+ public void testEndOfFileWithoutNewline() throws Exception
179
+ {
180
+ // In RFC 4180, the last record in the file may or may not have
181
+ // an ending line break.
182
+ assertEquals(expectedRecords(2,
183
+ "aaa", "bbb",
184
+ "ccc", "ddd"),
185
+ parse(task, newFileInputFromText(task,
186
+ "aaa,bbb\nccc,ddd")));
187
+ }
188
+
189
+ @Test
190
+ public void testChangeDelimiter() throws Exception
191
+ {
192
+ config.set("delimiter", JsonNodeFactory.instance.textNode("\t")); // TSV format
193
+ reloadPluginTask();
194
+ assertEquals(expectedRecords(2,
195
+ "aaa", "bbb",
196
+ "ccc", "ddd"),
197
+ parse(task,
198
+ "aaa\tbbb",
199
+ "ccc\tddd"));
200
+ }
201
+
202
+ @Test
203
+ public void testQuotedValues() throws Exception
204
+ {
205
+ assertEquals(expectedRecords(2,
206
+ "a\na\na", "b,bb",
207
+ "cc\"c", "\"ddd",
208
+ null, ""),
209
+ parse(task, newFileInputFromText(task,
210
+ "\n\"a\na\na\",\"b,bb\"\n\n\"cc\"\"c\",\"\"\"ddd\"\n,\"\"\n")));
211
+ }
212
+
213
+ @Test
214
+ public void parseEscapedValues() throws Exception
215
+ {
216
+ assertEquals(expectedRecords(2,
217
+ "a\"aa", "b,bb\"",
218
+ "cc\"c", "\"ddd",
219
+ null, ""),
220
+ parse(task, newFileInputFromText(task,
221
+ "\n\"a\\\"aa\",\"b,bb\\\"\"\n\n\"cc\"\"c\",\"\"\"ddd\"\n,\"\"\n")));
222
+ }
223
+
224
+ @Test
225
+ public void trimNonQuotedValues() throws Exception
226
+ {
227
+ assertEquals(expectedRecords(2,
228
+ " aaa ", " b cd ",
229
+ " ccc","dd d \n "), // quoted values are not changed
230
+ parse(task, newFileInputFromText(task,
231
+ " aaa , b cd \n\" ccc\",\"dd d \n \"")));
232
+
233
+ // trim_if_not_quoted is true
234
+ config.set("trim_if_not_quoted", true);
235
+ reloadPluginTask();
236
+ assertEquals(expectedRecords(2,
237
+ "aaa", "b cd",
238
+ " ccc","dd d \n "), // quoted values are not changed
239
+ parse(task, newFileInputFromText(task,
240
+ " aaa , b cd \n\" ccc\",\"dd d \n \"")));
241
+ }
242
+
243
+ @Test
244
+ public void parseQuotedValueWithSpacesAndTrimmingOption() throws Exception
245
+ {
246
+ config.set("trim_if_not_quoted", true);
247
+ reloadPluginTask();
248
+ assertEquals(expectedRecords(2,
249
+ "heading1", "heading2",
250
+ "trailing1","trailing2",
251
+ "trailing\n3","trailing\n4"),
252
+ parse(task,
253
+ " \"heading1\", \"heading2\"",
254
+ "\"trailing1\" ,\"trailing2\" ",
255
+ "\"trailing\n3\" ,\"trailing\n4\" "));
256
+ }
257
+
258
+ /*
259
+ @Test(expected = CsvTokenizer.CsvValueValidateException.class)
260
+ public void parseTooLargeSizedValues() throws Exception
261
+ {
262
+ config.set("max_quoted_column_size", 8L);
263
+ reloadPluginTask();
264
+ List<List<String>> parsed = doParse(task, bufferList("utf-8",
265
+ "aaa,bbb", "\n", "\"cccccccc\",ddd", "\n"));
266
+
267
+ assertEquals(Arrays.asList(
268
+ Arrays.asList("aaa", "bbb"),
269
+ Arrays.asList("ccc", "ddd")),
270
+ parsed);
271
+ }
272
+ */
273
+
274
+ /*
275
+ @Test
276
+ public void parseEscapedQuotedValues() throws Exception
277
+ {
278
+ List<List<String>> parsed = doParse(task, bufferList("utf-8",
279
+ "\"aa,a\",\",aaa\",\"aaa,\"", "\n",
280
+ "\"bb\"\"b\",\"\"\"bbb\",\"bbb\"\"\"", "\n",
281
+ "\"cc\\\"c\",\"\\\"ccc\",\"ccc\\\"\"", "\n",
282
+ "\"dd\nd\",\"\nddd\",\"ddd\n\"", "\n"));
283
+ assertEquals(Arrays.asList(
284
+ Arrays.asList("aa,a", ",aaa", "aaa,"),
285
+ Arrays.asList("bb\"b", "\"bbb", "bbb\""),
286
+ Arrays.asList("cc\"c", "\"ccc", "ccc\""),
287
+ Arrays.asList("dd\nd", "\nddd", "ddd\n")),
288
+ parsed);
289
+ }
290
+ */
291
+ }