embulk 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +13 -0
  3. data/Gemfile +3 -0
  4. data/Gemfile.lock +33 -0
  5. data/README.md +117 -0
  6. data/Rakefile +58 -0
  7. data/bin/embulk +63 -0
  8. data/build.gradle +149 -0
  9. data/embulk-cli/build.gradle +6 -0
  10. data/embulk-cli/pom.xml +94 -0
  11. data/embulk-cli/src/main/java/org/embulk/cli/Main.java +15 -0
  12. data/embulk-core/build.gradle +6 -0
  13. data/embulk-core/pom.xml +143 -0
  14. data/embulk-core/src/main/java/org/embulk/EmbulkService.java +39 -0
  15. data/embulk-core/src/main/java/org/embulk/command/Runner.java +199 -0
  16. data/embulk-core/src/main/java/org/embulk/command/TablePrinter.java +119 -0
  17. data/embulk-core/src/main/java/org/embulk/config/CommitReport.java +26 -0
  18. data/embulk-core/src/main/java/org/embulk/config/Config.java +15 -0
  19. data/embulk-core/src/main/java/org/embulk/config/ConfigDefault.java +15 -0
  20. data/embulk-core/src/main/java/org/embulk/config/ConfigException.java +20 -0
  21. data/embulk-core/src/main/java/org/embulk/config/ConfigLoader.java +83 -0
  22. data/embulk-core/src/main/java/org/embulk/config/ConfigSource.java +28 -0
  23. data/embulk-core/src/main/java/org/embulk/config/DataSource.java +35 -0
  24. data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +208 -0
  25. data/embulk-core/src/main/java/org/embulk/config/DataSourceSerDe.java +80 -0
  26. data/embulk-core/src/main/java/org/embulk/config/GenericTypeReference.java +20 -0
  27. data/embulk-core/src/main/java/org/embulk/config/ModelManager.java +125 -0
  28. data/embulk-core/src/main/java/org/embulk/config/NextConfig.java +26 -0
  29. data/embulk-core/src/main/java/org/embulk/config/Task.java +10 -0
  30. data/embulk-core/src/main/java/org/embulk/config/TaskInvocationHandler.java +180 -0
  31. data/embulk-core/src/main/java/org/embulk/config/TaskSerDe.java +343 -0
  32. data/embulk-core/src/main/java/org/embulk/config/TaskSource.java +28 -0
  33. data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +37 -0
  34. data/embulk-core/src/main/java/org/embulk/config/TaskValidator.java +24 -0
  35. data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +45 -0
  36. data/embulk-core/src/main/java/org/embulk/exec/ExecuteInterruptedException.java +10 -0
  37. data/embulk-core/src/main/java/org/embulk/exec/ExecuteResult.java +19 -0
  38. data/embulk-core/src/main/java/org/embulk/exec/ExtensionServiceLoaderModule.java +43 -0
  39. data/embulk-core/src/main/java/org/embulk/exec/ForSystemConfig.java +16 -0
  40. data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +307 -0
  41. data/embulk-core/src/main/java/org/embulk/exec/LocalExecutor.java +274 -0
  42. data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +30 -0
  43. data/embulk-core/src/main/java/org/embulk/exec/NoSampleException.java +10 -0
  44. data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +58 -0
  45. data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +138 -0
  46. data/embulk-core/src/main/java/org/embulk/exec/PreviewResult.java +27 -0
  47. data/embulk-core/src/main/java/org/embulk/exec/PreviewedNoticeError.java +17 -0
  48. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +116 -0
  49. data/embulk-core/src/main/java/org/embulk/exec/SystemConfigModule.java +24 -0
  50. data/embulk-core/src/main/java/org/embulk/jruby/JRubyPluginSource.java +69 -0
  51. data/embulk-core/src/main/java/org/embulk/jruby/JRubyScriptingModule.java +100 -0
  52. data/embulk-core/src/main/java/org/embulk/plugin/BuiltinPluginSourceModule.java +17 -0
  53. data/embulk-core/src/main/java/org/embulk/plugin/InjectedPluginSource.java +92 -0
  54. data/embulk-core/src/main/java/org/embulk/plugin/PluginManager.java +34 -0
  55. data/embulk-core/src/main/java/org/embulk/plugin/PluginSource.java +6 -0
  56. data/embulk-core/src/main/java/org/embulk/plugin/PluginSourceNotMatchException.java +19 -0
  57. data/embulk-core/src/main/java/org/embulk/plugin/PluginType.java +47 -0
  58. data/embulk-core/src/main/java/org/embulk/plugin/SetThreadContextClassLoader.java +19 -0
  59. data/embulk-core/src/main/java/org/embulk/spi/Buffer.java +113 -0
  60. data/embulk-core/src/main/java/org/embulk/spi/BufferAllocator.java +8 -0
  61. data/embulk-core/src/main/java/org/embulk/spi/Column.java +92 -0
  62. data/embulk-core/src/main/java/org/embulk/spi/ColumnConfig.java +79 -0
  63. data/embulk-core/src/main/java/org/embulk/spi/DecoderPlugin.java +16 -0
  64. data/embulk-core/src/main/java/org/embulk/spi/EncoderPlugin.java +16 -0
  65. data/embulk-core/src/main/java/org/embulk/spi/Exec.java +76 -0
  66. data/embulk-core/src/main/java/org/embulk/spi/ExecAction.java +6 -0
  67. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +105 -0
  68. data/embulk-core/src/main/java/org/embulk/spi/Extension.java +42 -0
  69. data/embulk-core/src/main/java/org/embulk/spi/FileInput.java +11 -0
  70. data/embulk-core/src/main/java/org/embulk/spi/FileInputPlugin.java +19 -0
  71. data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +113 -0
  72. data/embulk-core/src/main/java/org/embulk/spi/FileOutput.java +13 -0
  73. data/embulk-core/src/main/java/org/embulk/spi/FileOutputPlugin.java +20 -0
  74. data/embulk-core/src/main/java/org/embulk/spi/FileOutputRunner.java +167 -0
  75. data/embulk-core/src/main/java/org/embulk/spi/FormatterPlugin.java +18 -0
  76. data/embulk-core/src/main/java/org/embulk/spi/GuessPlugin.java +9 -0
  77. data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +20 -0
  78. data/embulk-core/src/main/java/org/embulk/spi/OutputPlugin.java +21 -0
  79. data/embulk-core/src/main/java/org/embulk/spi/Page.java +45 -0
  80. data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +327 -0
  81. data/embulk-core/src/main/java/org/embulk/spi/PageFormat.java +47 -0
  82. data/embulk-core/src/main/java/org/embulk/spi/PageOutput.java +11 -0
  83. data/embulk-core/src/main/java/org/embulk/spi/PageReader.java +227 -0
  84. data/embulk-core/src/main/java/org/embulk/spi/ParserPlugin.java +17 -0
  85. data/embulk-core/src/main/java/org/embulk/spi/Schema.java +101 -0
  86. data/embulk-core/src/main/java/org/embulk/spi/SchemaConfig.java +52 -0
  87. data/embulk-core/src/main/java/org/embulk/spi/SchemaVisitor.java +14 -0
  88. data/embulk-core/src/main/java/org/embulk/spi/Transactional.java +10 -0
  89. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileInput.java +17 -0
  90. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileOutput.java +19 -0
  91. data/embulk-core/src/main/java/org/embulk/spi/TransactionalPageOutput.java +17 -0
  92. data/embulk-core/src/main/java/org/embulk/spi/time/DateTimeZoneSerDe.java +57 -0
  93. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelper.java +8 -0
  94. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelperFactory.java +6 -0
  95. data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +159 -0
  96. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +98 -0
  97. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormatter.java +55 -0
  98. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParseException.java +6 -0
  99. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParser.java +60 -0
  100. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampSerDe.java +50 -0
  101. data/embulk-core/src/main/java/org/embulk/spi/type/AbstractType.java +55 -0
  102. data/embulk-core/src/main/java/org/embulk/spi/type/BooleanType.java +12 -0
  103. data/embulk-core/src/main/java/org/embulk/spi/type/DoubleType.java +12 -0
  104. data/embulk-core/src/main/java/org/embulk/spi/type/LongType.java +12 -0
  105. data/embulk-core/src/main/java/org/embulk/spi/type/StringType.java +12 -0
  106. data/embulk-core/src/main/java/org/embulk/spi/type/TimestampType.java +39 -0
  107. data/embulk-core/src/main/java/org/embulk/spi/type/Type.java +15 -0
  108. data/embulk-core/src/main/java/org/embulk/spi/type/TypeDeserializer.java +47 -0
  109. data/embulk-core/src/main/java/org/embulk/spi/type/Types.java +14 -0
  110. data/embulk-core/src/main/java/org/embulk/spi/util/CharsetSerDe.java +55 -0
  111. data/embulk-core/src/main/java/org/embulk/spi/util/Decoders.java +81 -0
  112. data/embulk-core/src/main/java/org/embulk/spi/util/Encoders.java +81 -0
  113. data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +110 -0
  114. data/embulk-core/src/main/java/org/embulk/spi/util/FileOutputOutputStream.java +94 -0
  115. data/embulk-core/src/main/java/org/embulk/spi/util/InputStreamFileInput.java +111 -0
  116. data/embulk-core/src/main/java/org/embulk/spi/util/Inputs.java +74 -0
  117. data/embulk-core/src/main/java/org/embulk/spi/util/LineDecoder.java +118 -0
  118. data/embulk-core/src/main/java/org/embulk/spi/util/LineEncoder.java +109 -0
  119. data/embulk-core/src/main/java/org/embulk/spi/util/ListFileInput.java +52 -0
  120. data/embulk-core/src/main/java/org/embulk/spi/util/Newline.java +38 -0
  121. data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +102 -0
  122. data/embulk-core/src/main/java/org/embulk/spi/util/Pages.java +139 -0
  123. data/embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java +110 -0
  124. data/embulk-core/src/test/java/org/embulk/GuiceBinder.java +72 -0
  125. data/embulk-core/src/test/java/org/embulk/RandomManager.java +53 -0
  126. data/embulk-core/src/test/java/org/embulk/TestPluginSourceModule.java +23 -0
  127. data/embulk-core/src/test/java/org/embulk/TestUtilityModule.java +17 -0
  128. data/embulk-core/src/test/java/org/embulk/config/TestConfigSource.java +114 -0
  129. data/embulk-core/src/test/java/org/embulk/config/TestTaskSource.java +70 -0
  130. data/embulk-core/src/test/java/org/embulk/plugin/MockPluginSource.java +57 -0
  131. data/embulk-core/src/test/java/org/embulk/plugin/TestPluginType.java +18 -0
  132. data/embulk-core/src/test/java/org/embulk/spi/MockFileOutput.java +63 -0
  133. data/embulk-core/src/test/java/org/embulk/spi/MockFormatterPlugin.java +101 -0
  134. data/embulk-core/src/test/java/org/embulk/spi/MockParserPlugin.java +73 -0
  135. data/embulk-core/src/test/java/org/embulk/spi/PageTestUtils.java +78 -0
  136. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +67 -0
  137. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputRunner.java +180 -0
  138. data/embulk-core/src/test/java/org/embulk/spi/TestFileOutputRunner.java +192 -0
  139. data/embulk-core/src/test/java/org/embulk/spi/TestInputStreamFileInput.java +188 -0
  140. data/embulk-core/src/test/java/org/embulk/spi/TestPageBuilderReader.java +301 -0
  141. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestamp.java +116 -0
  142. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestampFormatterParser.java +52 -0
  143. data/embulk-core/src/test/java/org/embulk/spi/type/TestTypeSerDe.java +45 -0
  144. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineDecoder.java +132 -0
  145. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineEncoder.java +123 -0
  146. data/embulk-standards/build.gradle +6 -0
  147. data/embulk-standards/pom.xml +68 -0
  148. data/embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java +158 -0
  149. data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +233 -0
  150. data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +355 -0
  151. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java +55 -0
  152. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java +39 -0
  153. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +138 -0
  154. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java +128 -0
  155. data/embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java +46 -0
  156. data/embulk-standards/src/main/java/org/embulk/standards/S3FileInputPlugin.java +238 -0
  157. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginExtension.java +16 -0
  158. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +44 -0
  159. data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +71 -0
  160. data/embulk-standards/src/main/resources/META-INF/services/org.embulk.spi.Extension +1 -0
  161. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +69 -0
  162. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +291 -0
  163. data/embulk-standards/src/test/java/org/embulk/standards/TestS3FileInputPlugin.java +43 -0
  164. data/embulk.gemspec +27 -0
  165. data/examples/config.yml +34 -0
  166. data/examples/csv/sample.csv.gz +0 -0
  167. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  168. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  169. data/gradlew +164 -0
  170. data/gradlew.bat +90 -0
  171. data/lib/embulk.rb +16 -0
  172. data/lib/embulk/buffer.rb +17 -0
  173. data/lib/embulk/column.rb +47 -0
  174. data/lib/embulk/command/embulk.rb +39 -0
  175. data/lib/embulk/command/embulk_example.rb +32 -0
  176. data/lib/embulk/command/embulk_generate_bin.rb +62 -0
  177. data/lib/embulk/command/embulk_run.rb +243 -0
  178. data/lib/embulk/data/bundle/.bundle/config +3 -0
  179. data/lib/embulk/data/bundle/Gemfile +31 -0
  180. data/lib/embulk/data/bundle/Gemfile.lock +8 -0
  181. data/lib/embulk/data/bundle/embulk/input_example.rb +40 -0
  182. data/lib/embulk/data/bundle/embulk/output_example.rb +51 -0
  183. data/lib/embulk/data_source.rb +66 -0
  184. data/lib/embulk/error.rb +5 -0
  185. data/lib/embulk/guess_charset.rb +26 -0
  186. data/lib/embulk/guess_csv.rb +195 -0
  187. data/lib/embulk/guess_gzip.rb +18 -0
  188. data/lib/embulk/guess_newline.rb +20 -0
  189. data/lib/embulk/guess_plugin.rb +113 -0
  190. data/lib/embulk/input_plugin.rb +53 -0
  191. data/lib/embulk/java/bootstrap.rb +12 -0
  192. data/lib/embulk/java/imports.rb +26 -0
  193. data/lib/embulk/java/time_helper.rb +77 -0
  194. data/lib/embulk/output_plugin.rb +104 -0
  195. data/lib/embulk/page.rb +28 -0
  196. data/lib/embulk/page_builder.rb +22 -0
  197. data/lib/embulk/plugin.rb +152 -0
  198. data/lib/embulk/plugin_registry.rb +70 -0
  199. data/lib/embulk/schema.rb +85 -0
  200. data/lib/embulk/time_format_guess.rb +331 -0
  201. data/lib/embulk/version.rb +3 -0
  202. data/pom.xml +533 -0
  203. data/settings.gradle +5 -0
  204. metadata +370 -0
@@ -0,0 +1,6 @@
1
+ dependencies {
2
+ compile project(':embulk-core')
3
+ compile 'com.amazonaws:aws-java-sdk:1.5.2'
4
+
5
+ testCompile project(':embulk-core').sourceSets.test.output
6
+ }
@@ -0,0 +1,68 @@
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3
+ <modelVersion>4.0.0</modelVersion>
4
+
5
+ <parent>
6
+ <groupId>org.embulk</groupId>
7
+ <artifactId>embulk-parent</artifactId>
8
+ <version>0.1.0-SNAPSHOT</version>
9
+ </parent>
10
+
11
+ <artifactId>embulk-standards</artifactId>
12
+ <name>embulk-standards</name>
13
+
14
+ <dependencies>
15
+ <dependency>
16
+ <groupId>org.embulk</groupId>
17
+ <artifactId>embulk-core</artifactId>
18
+ </dependency>
19
+
20
+ <dependency>
21
+ <groupId>org.embulk</groupId>
22
+ <artifactId>embulk-core</artifactId>
23
+ <type>test-jar</type>
24
+ <scope>test</scope>
25
+ </dependency>
26
+
27
+ <dependency>
28
+ <groupId>com.google.guava</groupId>
29
+ <artifactId>guava</artifactId>
30
+ </dependency>
31
+
32
+ <dependency>
33
+ <groupId>com.google.inject</groupId>
34
+ <artifactId>guice</artifactId>
35
+ </dependency>
36
+
37
+ <dependency>
38
+ <groupId>javax.validation</groupId>
39
+ <artifactId>validation-api</artifactId>
40
+ </dependency>
41
+
42
+ <dependency>
43
+ <groupId>com.fasterxml.jackson.core</groupId>
44
+ <artifactId>jackson-databind</artifactId>
45
+ </dependency>
46
+
47
+ <dependency>
48
+ <groupId>org.slf4j</groupId>
49
+ <artifactId>slf4j-api</artifactId>
50
+ </dependency>
51
+
52
+ <dependency>
53
+ <groupId>com.amazonaws</groupId>
54
+ <artifactId>aws-java-sdk</artifactId>
55
+ <version>1.5.2</version>
56
+ </dependency>
57
+
58
+ <dependency>
59
+ <groupId>junit</groupId>
60
+ <artifactId>junit</artifactId>
61
+ </dependency>
62
+
63
+ <dependency>
64
+ <groupId>org.mockito</groupId>
65
+ <artifactId>mockito-core</artifactId>
66
+ </dependency>
67
+ </dependencies>
68
+ </project>
@@ -0,0 +1,158 @@
1
+ package org.embulk.standards;
2
+
3
+ import com.google.common.collect.ImmutableBiMap;
4
+ import com.google.common.collect.ImmutableMap;
5
+ import org.embulk.config.Config;
6
+ import org.embulk.config.ConfigDefault;
7
+ import org.embulk.spi.type.TimestampType;
8
+ import org.embulk.spi.time.Timestamp;
9
+ import org.embulk.spi.time.TimestampFormatter;
10
+ import org.embulk.config.TaskSource;
11
+ import org.embulk.config.ConfigSource;
12
+ import org.embulk.spi.Column;
13
+ import org.embulk.spi.Schema;
14
+ import org.embulk.spi.SchemaVisitor;
15
+ import org.embulk.spi.FormatterPlugin;
16
+ import org.embulk.spi.Page;
17
+ import org.embulk.spi.PageOutput;
18
+ import org.embulk.spi.PageReader;
19
+ import org.embulk.spi.Exec;
20
+ import org.embulk.spi.FileOutput;
21
+ import org.embulk.spi.util.LineEncoder;
22
+
23
+ import java.util.Map;
24
+
25
+ public class CsvFormatterPlugin
26
+ implements FormatterPlugin
27
+ {
28
+ public interface PluginTask
29
+ extends LineEncoder.EncoderTask, TimestampFormatter.FormatterTask
30
+ {
31
+ @Config("header_line")
32
+ @ConfigDefault("true")
33
+ public boolean getHeaderLine();
34
+ }
35
+
36
+ @Override
37
+ public void transaction(ConfigSource config, Schema schema,
38
+ FormatterPlugin.Control control)
39
+ {
40
+ PluginTask task = config.loadConfig(PluginTask.class);
41
+ control.run(task.dump());
42
+ }
43
+
44
+ private Map<Integer, TimestampFormatter> newTimestampFormatters(
45
+ TimestampFormatter.FormatterTask task, Schema schema)
46
+ {
47
+ ImmutableMap.Builder<Integer, TimestampFormatter> builder = new ImmutableBiMap.Builder<>();
48
+ for (Column column : schema.getColumns()) {
49
+ if (column.getType() instanceof TimestampType) {
50
+ TimestampType tt = (TimestampType) column.getType();
51
+ builder.put(column.getIndex(), new TimestampFormatter(tt.getFormat(), task));
52
+ }
53
+ }
54
+ return builder.build();
55
+ }
56
+
57
+ @Override
58
+ public PageOutput open(TaskSource taskSource, final Schema schema,
59
+ FileOutput output)
60
+ {
61
+ final PluginTask task = taskSource.loadTask(PluginTask.class);
62
+ final LineEncoder encoder = new LineEncoder(output, task);
63
+ final Map<Integer, TimestampFormatter> timestampFormatters =
64
+ newTimestampFormatters(task, schema);
65
+
66
+ // create a file
67
+ encoder.nextFile();
68
+
69
+ // write header
70
+ if (task.getHeaderLine()) {
71
+ writeHeader(schema, encoder);
72
+ }
73
+
74
+ return new PageOutput() {
75
+ private final PageReader pageReader = new PageReader(schema);
76
+
77
+ public void add(Page page)
78
+ {
79
+ pageReader.setPage(page);
80
+ while (pageReader.nextRecord()) {
81
+ schema.visitColumns(new SchemaVisitor() {
82
+ public void booleanColumn(Column column)
83
+ {
84
+ addDelimiter(column);
85
+ if (!pageReader.isNull(column)) {
86
+ encoder.addText(Boolean.toString(pageReader.getBoolean(column)));
87
+ }
88
+ }
89
+
90
+ public void longColumn(Column column)
91
+ {
92
+ addDelimiter(column);
93
+ if (!pageReader.isNull(column)) {
94
+ encoder.addText(Long.toString(pageReader.getLong(column)));
95
+ }
96
+ }
97
+
98
+ public void doubleColumn(Column column)
99
+ {
100
+ addDelimiter(column);
101
+ if (!pageReader.isNull(column)) {
102
+ encoder.addText(Double.toString(pageReader.getDouble(column)));
103
+ }
104
+ }
105
+
106
+ public void stringColumn(Column column)
107
+ {
108
+ addDelimiter(column);
109
+ if (!pageReader.isNull(column)) {
110
+ // TODO escape and quoting
111
+ encoder.addText(pageReader.getString(column));
112
+ }
113
+ }
114
+
115
+ public void timestampColumn(Column column)
116
+ {
117
+ addDelimiter(column);
118
+ if (!pageReader.isNull(column)) {
119
+ Timestamp value = pageReader.getTimestamp(column);
120
+ encoder.addText(timestampFormatters.get(column.getIndex()).format(value));
121
+ }
122
+ }
123
+
124
+ private void addDelimiter(Column column)
125
+ {
126
+ if (column.getIndex() != 0) {
127
+ encoder.addText(",");
128
+ }
129
+ }
130
+ });
131
+
132
+ encoder.addNewLine();
133
+ }
134
+ }
135
+
136
+ public void finish()
137
+ {
138
+ encoder.finish();
139
+ }
140
+
141
+ public void close()
142
+ {
143
+ encoder.close();
144
+ }
145
+ };
146
+ }
147
+
148
+ private void writeHeader(Schema schema, LineEncoder encoder)
149
+ {
150
+ for (Column column : schema.getColumns()) {
151
+ if (column.getIndex() != 0) {
152
+ encoder.addText(",");
153
+ }
154
+ encoder.addText(column.getName());
155
+ }
156
+ encoder.addNewLine();
157
+ }
158
+ }
@@ -0,0 +1,233 @@
1
+ package org.embulk.standards;
2
+
3
+ import com.google.common.base.Preconditions;
4
+ import com.google.common.collect.ImmutableMap;
5
+ import com.google.common.base.Optional;
6
+ import org.embulk.config.Task;
7
+ import org.embulk.config.Config;
8
+ import org.embulk.config.ConfigDefault;
9
+ import org.embulk.config.ConfigSource;
10
+ import org.embulk.config.TaskSource;
11
+ import org.embulk.spi.type.TimestampType;
12
+ import org.embulk.spi.time.TimestampParser;
13
+ import org.embulk.spi.time.TimestampParseException;
14
+ import org.embulk.spi.Column;
15
+ import org.embulk.spi.Schema;
16
+ import org.embulk.spi.SchemaConfig;
17
+ import org.embulk.spi.SchemaVisitor;
18
+ import org.embulk.spi.PageBuilder;
19
+ import org.embulk.spi.ParserPlugin;
20
+ import org.embulk.spi.Exec;
21
+ import org.embulk.spi.FileInput;
22
+ import org.embulk.spi.PageOutput;
23
+ import org.embulk.spi.BufferAllocator;
24
+ import org.embulk.spi.util.LineDecoder;
25
+ import org.slf4j.Logger;
26
+
27
+ import java.util.Map;
28
+
29
+ public class CsvParserPlugin
30
+ implements ParserPlugin
31
+ {
32
+ public interface PluginTask
33
+ extends Task, LineDecoder.DecoderTask, TimestampParser.ParserTask
34
+ {
35
+ @Config("columns")
36
+ public SchemaConfig getSchemaConfig();
37
+
38
+ @Config("header_line") // how to set default value?? TODO @Default("true")
39
+ @ConfigDefault("false")
40
+ public boolean getHeaderLine();
41
+
42
+ @Config("delimiter")
43
+ @ConfigDefault("\",\"")
44
+ public char getDelimiterChar();
45
+
46
+ @Config("quote")
47
+ @ConfigDefault("\"\\\"\"")
48
+ public char getQuoteChar();
49
+
50
+ @Config("escape")
51
+ @ConfigDefault("\"\\\\\"")
52
+ public char getEscapeChar();
53
+
54
+ // Null value handling: if the CsvParser found 'non-quoted empty string's,
55
+ // it replaces them to string that users specified like "\N", "NULL".
56
+ @Config("null_string")
57
+ @ConfigDefault("null")
58
+ public Optional<String> getNullString();
59
+
60
+ @Config("trim_if_not_quoted")
61
+ @ConfigDefault("false")
62
+ public boolean getTrimIfNotQuoted();
63
+
64
+ @Config("max_quoted_size_limit")
65
+ @ConfigDefault("131072") //128kB
66
+ public long getMaxQuotedSizeLimit();
67
+ }
68
+
69
+ private final Logger log;
70
+
71
+ public CsvParserPlugin()
72
+ {
73
+ log = Exec.getLogger(CsvParserPlugin.class);
74
+ }
75
+
76
+ @Override
77
+ public void transaction(ConfigSource config, ParserPlugin.Control control)
78
+ {
79
+ PluginTask task = config.loadConfig(PluginTask.class);
80
+ control.run(task.dump(), task.getSchemaConfig().toSchema());
81
+ }
82
+
83
+ private Map<Integer, TimestampParser> newTimestampParsers(
84
+ TimestampParser.ParserTask task, Schema schema)
85
+ {
86
+ ImmutableMap.Builder<Integer, TimestampParser> builder = new ImmutableMap.Builder<>();
87
+ for (Column column : schema.getColumns()) {
88
+ if (column.getType() instanceof TimestampType) {
89
+ TimestampType tt = (TimestampType) column.getType();
90
+ builder.put(column.getIndex(), new TimestampParser(tt.getFormat(), task));
91
+ }
92
+ }
93
+ return builder.build();
94
+ }
95
+
96
+ @Override
97
+ public void run(TaskSource taskSource, final Schema schema,
98
+ FileInput input, PageOutput output)
99
+ {
100
+ PluginTask task = taskSource.loadTask(PluginTask.class);
101
+ final Map<Integer, TimestampParser> timestampFormatters = newTimestampParsers(task, schema);
102
+ final CsvTokenizer tokenizer = new CsvTokenizer(new LineDecoder(input, task), task);
103
+ final String nullStringOrNull = task.getNullString().orNull();
104
+ boolean skipHeaderLine = task.getHeaderLine();
105
+
106
+ try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
107
+ while (tokenizer.nextFile()) {
108
+ if (skipHeaderLine) {
109
+ // skip the first line
110
+ if (tokenizer.nextRecord()) {
111
+ for (int i=0; i < schema.getColumnCount(); i++) {
112
+ tokenizer.nextColumn(); // TODO check return value?
113
+ }
114
+ }
115
+ }
116
+
117
+ while (true) {
118
+ try {
119
+ if (!tokenizer.nextRecord()) {
120
+ break;
121
+ }
122
+
123
+ schema.visitColumns(new SchemaVisitor() {
124
+ public void booleanColumn(Column column)
125
+ {
126
+ String v = nextColumn(schema, tokenizer, nullStringOrNull);
127
+ if (v == null) {
128
+ pageBuilder.setNull(column);
129
+ } else {
130
+ pageBuilder.setBoolean(column, Boolean.parseBoolean(v));
131
+ }
132
+ }
133
+
134
+ public void longColumn(Column column)
135
+ {
136
+ String v = nextColumn(schema, tokenizer, nullStringOrNull);
137
+ if (v == null) {
138
+ pageBuilder.setNull(column);
139
+ } else {
140
+ try {
141
+ pageBuilder.setLong(column, Long.parseLong(v));
142
+ } catch (NumberFormatException e) {
143
+ // TODO support default value
144
+ throw new CsvRecordValidateException(e);
145
+ }
146
+ }
147
+ }
148
+
149
+ public void doubleColumn(Column column)
150
+ {
151
+ String v = nextColumn(schema, tokenizer, nullStringOrNull);
152
+ if (v == null) {
153
+ pageBuilder.setNull(column);
154
+ } else {
155
+ try {
156
+ pageBuilder.setDouble(column, Double.parseDouble(v));
157
+ } catch (NumberFormatException e) {
158
+ // TODO support default value
159
+ throw new CsvRecordValidateException(e);
160
+ }
161
+ }
162
+ }
163
+
164
+ public void stringColumn(Column column)
165
+ {
166
+ String v = nextColumn(schema, tokenizer, nullStringOrNull);
167
+ if (v == null) {
168
+ pageBuilder.setNull(column);
169
+ } else {
170
+ pageBuilder.setString(column, v);
171
+ }
172
+ }
173
+
174
+ public void timestampColumn(Column column)
175
+ {
176
+ String v = nextColumn(schema, tokenizer, nullStringOrNull);
177
+ if (v == null) {
178
+ pageBuilder.setNull(column);
179
+ } else {
180
+ try {
181
+ pageBuilder.setTimestamp(column, (timestampFormatters.get(column.getIndex()).parse(v)));
182
+ } catch (TimestampParseException e) {
183
+ // TODO support default value
184
+ throw new CsvRecordValidateException(e);
185
+ }
186
+ }
187
+ }
188
+ });
189
+ pageBuilder.addRecord();
190
+
191
+ } catch (Exception e) {
192
+ // TODO logging
193
+ long lineNumber = tokenizer.getCurrentLineNumber();
194
+ String skippedLine = tokenizer.skipCurrentLine();
195
+ log.warn(String.format("Skipped (line %d): %s", lineNumber, skippedLine), e);
196
+ //exec.notice().skippedLine(skippedLine);
197
+ }
198
+ }
199
+ }
200
+
201
+ pageBuilder.finish();
202
+ }
203
+ }
204
+
205
+ private static String nextColumn(Schema schema, CsvTokenizer tokenizer, String nullStringOrNull)
206
+ {
207
+ String v = tokenizer.nextColumn();
208
+ if (v == null) {
209
+ throw new RuntimeException(String.format("Expected %d columns but line %d has fewer number of columns",
210
+ schema.getColumnCount(), tokenizer.getCurrentLineNumber()));
211
+ }
212
+
213
+ if (!v.isEmpty()) {
214
+ if (v.equals(nullStringOrNull)) {
215
+ return null;
216
+ }
217
+ return v;
218
+ } else if (tokenizer.wasQuotedColumn()) {
219
+ return "";
220
+ } else {
221
+ return null;
222
+ }
223
+ }
224
+
225
+ static class CsvRecordValidateException
226
+ extends RuntimeException
227
+ {
228
+ CsvRecordValidateException(Throwable cause)
229
+ {
230
+ super(cause);
231
+ }
232
+ }
233
+ }