embulk 0.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (393)
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.ruby-version +1 -0
  4. data/.travis.yml +18 -0
  5. data/COPYING +14 -0
  6. data/Gemfile +2 -0
  7. data/Gemfile.lock +31 -0
  8. data/README.md +206 -0
  9. data/Rakefile +26 -0
  10. data/appveyor.yml +20 -0
  11. data/bin/embulk +106 -0
  12. data/build.gradle +338 -0
  13. data/embulk-cli/build.gradle +6 -0
  14. data/embulk-cli/src/main/java/org/embulk/cli/Main.java +22 -0
  15. data/embulk-cli/src/main/sh/selfrun.sh +158 -0
  16. data/embulk-cli/src/test/java/org/embulk/cli/DummyMain.java +23 -0
  17. data/embulk-cli/src/test/java/org/embulk/cli/SelfrunTest.java +281 -0
  18. data/embulk-core/build.gradle +59 -0
  19. data/embulk-core/src/main/java/org/embulk/EmbulkEmbed.java +315 -0
  20. data/embulk-core/src/main/java/org/embulk/EmbulkService.java +76 -0
  21. data/embulk-core/src/main/java/org/embulk/command/PreviewPrinter.java +84 -0
  22. data/embulk-core/src/main/java/org/embulk/command/TablePreviewPrinter.java +107 -0
  23. data/embulk-core/src/main/java/org/embulk/command/VerticalPreviewPrinter.java +47 -0
  24. data/embulk-core/src/main/java/org/embulk/config/CommitReport.java +33 -0
  25. data/embulk-core/src/main/java/org/embulk/config/Config.java +15 -0
  26. data/embulk-core/src/main/java/org/embulk/config/ConfigDefault.java +15 -0
  27. data/embulk-core/src/main/java/org/embulk/config/ConfigDiff.java +29 -0
  28. data/embulk-core/src/main/java/org/embulk/config/ConfigException.java +20 -0
  29. data/embulk-core/src/main/java/org/embulk/config/ConfigInject.java +14 -0
  30. data/embulk-core/src/main/java/org/embulk/config/ConfigLoader.java +141 -0
  31. data/embulk-core/src/main/java/org/embulk/config/ConfigSource.java +31 -0
  32. data/embulk-core/src/main/java/org/embulk/config/DataSource.java +39 -0
  33. data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +231 -0
  34. data/embulk-core/src/main/java/org/embulk/config/DataSourceSerDe.java +84 -0
  35. data/embulk-core/src/main/java/org/embulk/config/GenericTypeReference.java +20 -0
  36. data/embulk-core/src/main/java/org/embulk/config/ModelManager.java +123 -0
  37. data/embulk-core/src/main/java/org/embulk/config/Task.java +10 -0
  38. data/embulk-core/src/main/java/org/embulk/config/TaskInvocationHandler.java +180 -0
  39. data/embulk-core/src/main/java/org/embulk/config/TaskReport.java +29 -0
  40. data/embulk-core/src/main/java/org/embulk/config/TaskSerDe.java +345 -0
  41. data/embulk-core/src/main/java/org/embulk/config/TaskSource.java +31 -0
  42. data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +38 -0
  43. data/embulk-core/src/main/java/org/embulk/config/TaskValidator.java +24 -0
  44. data/embulk-core/src/main/java/org/embulk/exec/BulkLoader.java +652 -0
  45. data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +52 -0
  46. data/embulk-core/src/main/java/org/embulk/exec/ExecutionInterruptedException.java +10 -0
  47. data/embulk-core/src/main/java/org/embulk/exec/ExecutionResult.java +26 -0
  48. data/embulk-core/src/main/java/org/embulk/exec/ExtensionServiceLoaderModule.java +43 -0
  49. data/embulk-core/src/main/java/org/embulk/exec/ForGuess.java +16 -0
  50. data/embulk-core/src/main/java/org/embulk/exec/ForSystemConfig.java +16 -0
  51. data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +373 -0
  52. data/embulk-core/src/main/java/org/embulk/exec/LocalExecutorPlugin.java +129 -0
  53. data/embulk-core/src/main/java/org/embulk/exec/LocalThreadExecutor.java +34 -0
  54. data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +60 -0
  55. data/embulk-core/src/main/java/org/embulk/exec/NoSampleException.java +10 -0
  56. data/embulk-core/src/main/java/org/embulk/exec/PartialExecutionException.java +18 -0
  57. data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +77 -0
  58. data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +183 -0
  59. data/embulk-core/src/main/java/org/embulk/exec/PreviewResult.java +27 -0
  60. data/embulk-core/src/main/java/org/embulk/exec/PreviewedNoticeError.java +17 -0
  61. data/embulk-core/src/main/java/org/embulk/exec/ResumeState.java +100 -0
  62. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +136 -0
  63. data/embulk-core/src/main/java/org/embulk/exec/SetCurrentThreadName.java +19 -0
  64. data/embulk-core/src/main/java/org/embulk/exec/SystemConfigModule.java +24 -0
  65. data/embulk-core/src/main/java/org/embulk/exec/TempFileAllocator.java +35 -0
  66. data/embulk-core/src/main/java/org/embulk/guice/Bootstrap.java +157 -0
  67. data/embulk-core/src/main/java/org/embulk/guice/CloseableInjector.java +22 -0
  68. data/embulk-core/src/main/java/org/embulk/guice/InjectorProxy.java +145 -0
  69. data/embulk-core/src/main/java/org/embulk/guice/LifeCycleInjector.java +26 -0
  70. data/embulk-core/src/main/java/org/embulk/guice/LifeCycleInjectorProxy.java +61 -0
  71. data/embulk-core/src/main/java/org/embulk/guice/LifeCycleManager.java +187 -0
  72. data/embulk-core/src/main/java/org/embulk/guice/LifeCycleMethods.java +89 -0
  73. data/embulk-core/src/main/java/org/embulk/guice/LifeCycleMethodsMap.java +38 -0
  74. data/embulk-core/src/main/java/org/embulk/guice/LifeCycleModule.java +97 -0
  75. data/embulk-core/src/main/java/org/embulk/jruby/JRubyPluginSource.java +72 -0
  76. data/embulk-core/src/main/java/org/embulk/jruby/JRubyScriptingModule.java +119 -0
  77. data/embulk-core/src/main/java/org/embulk/plugin/BuiltinPluginSourceModule.java +17 -0
  78. data/embulk-core/src/main/java/org/embulk/plugin/InjectedPluginSource.java +96 -0
  79. data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoader.java +168 -0
  80. data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoaderFactory.java +9 -0
  81. data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoaderModule.java +71 -0
  82. data/embulk-core/src/main/java/org/embulk/plugin/PluginManager.java +78 -0
  83. data/embulk-core/src/main/java/org/embulk/plugin/PluginSource.java +6 -0
  84. data/embulk-core/src/main/java/org/embulk/plugin/PluginSourceNotMatchException.java +19 -0
  85. data/embulk-core/src/main/java/org/embulk/plugin/PluginType.java +47 -0
  86. data/embulk-core/src/main/java/org/embulk/plugin/compat/InputPluginWrapper.java +102 -0
  87. data/embulk-core/src/main/java/org/embulk/plugin/compat/PluginWrappers.java +30 -0
  88. data/embulk-core/src/main/java/org/embulk/plugin/compat/TransactionalFileInputWrapper.java +96 -0
  89. data/embulk-core/src/main/java/org/embulk/plugin/compat/TransactionalFileOutputWrapper.java +102 -0
  90. data/embulk-core/src/main/java/org/embulk/plugin/compat/TransactionalPageOutputWrapper.java +95 -0
  91. data/embulk-core/src/main/java/org/embulk/spi/Buffer.java +148 -0
  92. data/embulk-core/src/main/java/org/embulk/spi/BufferAllocator.java +8 -0
  93. data/embulk-core/src/main/java/org/embulk/spi/Column.java +92 -0
  94. data/embulk-core/src/main/java/org/embulk/spi/ColumnConfig.java +112 -0
  95. data/embulk-core/src/main/java/org/embulk/spi/ColumnVisitor.java +14 -0
  96. data/embulk-core/src/main/java/org/embulk/spi/DecoderPlugin.java +16 -0
  97. data/embulk-core/src/main/java/org/embulk/spi/EncoderPlugin.java +16 -0
  98. data/embulk-core/src/main/java/org/embulk/spi/Exec.java +113 -0
  99. data/embulk-core/src/main/java/org/embulk/spi/ExecAction.java +6 -0
  100. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +217 -0
  101. data/embulk-core/src/main/java/org/embulk/spi/ExecutorPlugin.java +19 -0
  102. data/embulk-core/src/main/java/org/embulk/spi/Extension.java +44 -0
  103. data/embulk-core/src/main/java/org/embulk/spi/FileInput.java +11 -0
  104. data/embulk-core/src/main/java/org/embulk/spi/FileInputPlugin.java +30 -0
  105. data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +162 -0
  106. data/embulk-core/src/main/java/org/embulk/spi/FileOutput.java +13 -0
  107. data/embulk-core/src/main/java/org/embulk/spi/FileOutputPlugin.java +28 -0
  108. data/embulk-core/src/main/java/org/embulk/spi/FileOutputRunner.java +202 -0
  109. data/embulk-core/src/main/java/org/embulk/spi/FilterPlugin.java +18 -0
  110. data/embulk-core/src/main/java/org/embulk/spi/FormatterPlugin.java +18 -0
  111. data/embulk-core/src/main/java/org/embulk/spi/GuessPlugin.java +9 -0
  112. data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +33 -0
  113. data/embulk-core/src/main/java/org/embulk/spi/OutputPlugin.java +29 -0
  114. data/embulk-core/src/main/java/org/embulk/spi/Page.java +51 -0
  115. data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +338 -0
  116. data/embulk-core/src/main/java/org/embulk/spi/PageFormat.java +47 -0
  117. data/embulk-core/src/main/java/org/embulk/spi/PageOutput.java +11 -0
  118. data/embulk-core/src/main/java/org/embulk/spi/PageReader.java +226 -0
  119. data/embulk-core/src/main/java/org/embulk/spi/ParserPlugin.java +17 -0
  120. data/embulk-core/src/main/java/org/embulk/spi/ProcessState.java +10 -0
  121. data/embulk-core/src/main/java/org/embulk/spi/ProcessTask.java +117 -0
  122. data/embulk-core/src/main/java/org/embulk/spi/Schema.java +134 -0
  123. data/embulk-core/src/main/java/org/embulk/spi/SchemaConfig.java +93 -0
  124. data/embulk-core/src/main/java/org/embulk/spi/SchemaConfigException.java +22 -0
  125. data/embulk-core/src/main/java/org/embulk/spi/TaskState.java +81 -0
  126. data/embulk-core/src/main/java/org/embulk/spi/TempFileException.java +19 -0
  127. data/embulk-core/src/main/java/org/embulk/spi/TempFileSpace.java +87 -0
  128. data/embulk-core/src/main/java/org/embulk/spi/Transactional.java +10 -0
  129. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileInput.java +17 -0
  130. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileOutput.java +19 -0
  131. data/embulk-core/src/main/java/org/embulk/spi/TransactionalPageOutput.java +17 -0
  132. data/embulk-core/src/main/java/org/embulk/spi/time/DateTimeZoneSerDe.java +55 -0
  133. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelper.java +8 -0
  134. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelperFactory.java +6 -0
  135. data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +159 -0
  136. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +100 -0
  137. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormatter.java +97 -0
  138. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParseException.java +10 -0
  139. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParser.java +104 -0
  140. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampSerDe.java +49 -0
  141. data/embulk-core/src/main/java/org/embulk/spi/type/AbstractType.java +58 -0
  142. data/embulk-core/src/main/java/org/embulk/spi/type/BooleanType.java +12 -0
  143. data/embulk-core/src/main/java/org/embulk/spi/type/DoubleType.java +12 -0
  144. data/embulk-core/src/main/java/org/embulk/spi/type/LongType.java +12 -0
  145. data/embulk-core/src/main/java/org/embulk/spi/type/StringType.java +12 -0
  146. data/embulk-core/src/main/java/org/embulk/spi/type/TimestampType.java +41 -0
  147. data/embulk-core/src/main/java/org/embulk/spi/type/Type.java +15 -0
  148. data/embulk-core/src/main/java/org/embulk/spi/type/TypeDeserializer.java +44 -0
  149. data/embulk-core/src/main/java/org/embulk/spi/type/Types.java +14 -0
  150. data/embulk-core/src/main/java/org/embulk/spi/unit/ByteSize.java +156 -0
  151. data/embulk-core/src/main/java/org/embulk/spi/unit/LocalFile.java +106 -0
  152. data/embulk-core/src/main/java/org/embulk/spi/unit/LocalFileSerDe.java +113 -0
  153. data/embulk-core/src/main/java/org/embulk/spi/unit/ToString.java +54 -0
  154. data/embulk-core/src/main/java/org/embulk/spi/unit/ToStringMap.java +34 -0
  155. data/embulk-core/src/main/java/org/embulk/spi/util/CharsetSerDe.java +55 -0
  156. data/embulk-core/src/main/java/org/embulk/spi/util/Decoders.java +81 -0
  157. data/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnNotFoundException.java +10 -0
  158. data/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetter.java +18 -0
  159. data/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetterFactory.java +94 -0
  160. data/embulk-core/src/main/java/org/embulk/spi/util/DynamicPageBuilder.java +161 -0
  161. data/embulk-core/src/main/java/org/embulk/spi/util/Encoders.java +81 -0
  162. data/embulk-core/src/main/java/org/embulk/spi/util/Executors.java +95 -0
  163. data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +111 -0
  164. data/embulk-core/src/main/java/org/embulk/spi/util/FileOutputOutputStream.java +119 -0
  165. data/embulk-core/src/main/java/org/embulk/spi/util/Filters.java +100 -0
  166. data/embulk-core/src/main/java/org/embulk/spi/util/InputStreamFileInput.java +190 -0
  167. data/embulk-core/src/main/java/org/embulk/spi/util/InputStreamTransactionalFileInput.java +25 -0
  168. data/embulk-core/src/main/java/org/embulk/spi/util/Inputs.java +65 -0
  169. data/embulk-core/src/main/java/org/embulk/spi/util/LineDecoder.java +118 -0
  170. data/embulk-core/src/main/java/org/embulk/spi/util/LineEncoder.java +123 -0
  171. data/embulk-core/src/main/java/org/embulk/spi/util/ListFileInput.java +52 -0
  172. data/embulk-core/src/main/java/org/embulk/spi/util/Newline.java +38 -0
  173. data/embulk-core/src/main/java/org/embulk/spi/util/OutputStreamFileOutput.java +88 -0
  174. data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +102 -0
  175. data/embulk-core/src/main/java/org/embulk/spi/util/Pages.java +139 -0
  176. data/embulk-core/src/main/java/org/embulk/spi/util/ResumableInputStream.java +128 -0
  177. data/embulk-core/src/main/java/org/embulk/spi/util/RetryExecutor.java +130 -0
  178. data/embulk-core/src/main/java/org/embulk/spi/util/Timestamps.java +53 -0
  179. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/AbstractDynamicColumnSetter.java +79 -0
  180. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/BooleanColumnSetter.java +64 -0
  181. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/DefaultValueSetter.java +18 -0
  182. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/DoubleColumnSetter.java +61 -0
  183. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/LongColumnSetter.java +69 -0
  184. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/NullDefaultValueSetter.java +34 -0
  185. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/SkipColumnSetter.java +52 -0
  186. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/StringColumnSetter.java +56 -0
  187. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/TimestampColumnSetter.java +64 -0
  188. data/embulk-core/src/main/resources/embulk/logback-color.xml +72 -0
  189. data/embulk-core/src/main/resources/embulk/logback-console.xml +14 -0
  190. data/embulk-core/src/main/resources/embulk/logback-file.xml +25 -0
  191. data/embulk-core/src/main/resources/embulk/parent_first_packages.properties +70 -0
  192. data/embulk-core/src/main/resources/embulk/parent_first_resources.properties +28 -0
  193. data/embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java +114 -0
  194. data/embulk-core/src/test/java/org/embulk/GuiceBinder.java +72 -0
  195. data/embulk-core/src/test/java/org/embulk/RandomManager.java +53 -0
  196. data/embulk-core/src/test/java/org/embulk/TestPluginSourceModule.java +23 -0
  197. data/embulk-core/src/test/java/org/embulk/TestUtilityModule.java +17 -0
  198. data/embulk-core/src/test/java/org/embulk/config/TestConfigLoader.java +66 -0
  199. data/embulk-core/src/test/java/org/embulk/config/TestConfigSource.java +114 -0
  200. data/embulk-core/src/test/java/org/embulk/config/TestTaskSource.java +70 -0
  201. data/embulk-core/src/test/java/org/embulk/plugin/MockPluginSource.java +58 -0
  202. data/embulk-core/src/test/java/org/embulk/plugin/TestPluginType.java +18 -0
  203. data/embulk-core/src/test/java/org/embulk/spi/MockFileOutput.java +63 -0
  204. data/embulk-core/src/test/java/org/embulk/spi/MockFormatterPlugin.java +101 -0
  205. data/embulk-core/src/test/java/org/embulk/spi/MockParserPlugin.java +73 -0
  206. data/embulk-core/src/test/java/org/embulk/spi/PageTestUtils.java +57 -0
  207. data/embulk-core/src/test/java/org/embulk/spi/TestBuffer.java +24 -0
  208. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +89 -0
  209. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputRunner.java +196 -0
  210. data/embulk-core/src/test/java/org/embulk/spi/TestFileOutputRunner.java +207 -0
  211. data/embulk-core/src/test/java/org/embulk/spi/TestInputStreamFileInput.java +188 -0
  212. data/embulk-core/src/test/java/org/embulk/spi/TestPageBuilderReader.java +319 -0
  213. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestamp.java +116 -0
  214. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestampFormatterParser.java +63 -0
  215. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestampFormatterParserDeprecated.java +67 -0
  216. data/embulk-core/src/test/java/org/embulk/spi/type/TestTypeSerDe.java +45 -0
  217. data/embulk-core/src/test/java/org/embulk/spi/unit/TestByteSize.java +79 -0
  218. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineDecoder.java +132 -0
  219. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineEncoder.java +123 -0
  220. data/embulk-docs/Makefile +178 -0
  221. data/embulk-docs/build.gradle +32 -0
  222. data/embulk-docs/make.bat +243 -0
  223. data/embulk-docs/push-gh-pages.sh +49 -0
  224. data/embulk-docs/src/_static/embulk-architecture.png +0 -0
  225. data/embulk-docs/src/_static/embulk-logo.png +0 -0
  226. data/embulk-docs/src/_static/embulk-logo.svg +133 -0
  227. data/embulk-docs/src/built-in.rst +440 -0
  228. data/embulk-docs/src/conf.py +260 -0
  229. data/embulk-docs/src/customization.rst +184 -0
  230. data/embulk-docs/src/index.rst +84 -0
  231. data/embulk-docs/src/recipe.rst +8 -0
  232. data/embulk-docs/src/recipe/scheduled-csv-load-to-elasticsearch-kibana4.rst +153 -0
  233. data/embulk-docs/src/release.rst +57 -0
  234. data/embulk-docs/src/release/release-0.1.0.rst +8 -0
  235. data/embulk-docs/src/release/release-0.2.0.rst +16 -0
  236. data/embulk-docs/src/release/release-0.2.1.rst +19 -0
  237. data/embulk-docs/src/release/release-0.3.0.rst +34 -0
  238. data/embulk-docs/src/release/release-0.3.1.rst +11 -0
  239. data/embulk-docs/src/release/release-0.3.2.rst +15 -0
  240. data/embulk-docs/src/release/release-0.4.0.rst +74 -0
  241. data/embulk-docs/src/release/release-0.4.1.rst +18 -0
  242. data/embulk-docs/src/release/release-0.4.10.rst +17 -0
  243. data/embulk-docs/src/release/release-0.4.2.rst +18 -0
  244. data/embulk-docs/src/release/release-0.4.3.rst +34 -0
  245. data/embulk-docs/src/release/release-0.4.4.rst +39 -0
  246. data/embulk-docs/src/release/release-0.4.5.rst +24 -0
  247. data/embulk-docs/src/release/release-0.4.6.rst +30 -0
  248. data/embulk-docs/src/release/release-0.4.7.rst +16 -0
  249. data/embulk-docs/src/release/release-0.4.8.rst +15 -0
  250. data/embulk-docs/src/release/release-0.4.9.rst +23 -0
  251. data/embulk-docs/src/release/release-0.5.0.rst +89 -0
  252. data/embulk-docs/src/release/release-0.5.1.rst +13 -0
  253. data/embulk-docs/src/release/release-0.5.2.rst +30 -0
  254. data/embulk-docs/src/release/release-0.5.3.rst +22 -0
  255. data/embulk-docs/src/release/release-0.5.4.rst +24 -0
  256. data/embulk-docs/src/release/release-0.5.5.rst +18 -0
  257. data/embulk-docs/src/release/release-0.6.0.rst +34 -0
  258. data/embulk-docs/src/release/release-0.6.1.rst +11 -0
  259. data/embulk-docs/src/release/release-0.6.10.rst +15 -0
  260. data/embulk-docs/src/release/release-0.6.11.rst +19 -0
  261. data/embulk-docs/src/release/release-0.6.12.rst +31 -0
  262. data/embulk-docs/src/release/release-0.6.13.rst +23 -0
  263. data/embulk-docs/src/release/release-0.6.14.rst +47 -0
  264. data/embulk-docs/src/release/release-0.6.15.rst +26 -0
  265. data/embulk-docs/src/release/release-0.6.16.rst +26 -0
  266. data/embulk-docs/src/release/release-0.6.17.rst +39 -0
  267. data/embulk-docs/src/release/release-0.6.18.rst +14 -0
  268. data/embulk-docs/src/release/release-0.6.19.rst +18 -0
  269. data/embulk-docs/src/release/release-0.6.2.rst +17 -0
  270. data/embulk-docs/src/release/release-0.6.20.rst +19 -0
  271. data/embulk-docs/src/release/release-0.6.21.rst +20 -0
  272. data/embulk-docs/src/release/release-0.6.22.rst +26 -0
  273. data/embulk-docs/src/release/release-0.6.23.rst +17 -0
  274. data/embulk-docs/src/release/release-0.6.24.rst +13 -0
  275. data/embulk-docs/src/release/release-0.6.25.rst +12 -0
  276. data/embulk-docs/src/release/release-0.6.3.rst +23 -0
  277. data/embulk-docs/src/release/release-0.6.4.rst +13 -0
  278. data/embulk-docs/src/release/release-0.6.5.rst +17 -0
  279. data/embulk-docs/src/release/release-0.6.6.rst +17 -0
  280. data/embulk-docs/src/release/release-0.6.7.rst +17 -0
  281. data/embulk-docs/src/release/release-0.6.8.rst +24 -0
  282. data/embulk-docs/src/release/release-0.6.9.rst +24 -0
  283. data/embulk-docs/src/release/release-0.7.0.rst +96 -0
  284. data/embulk-standards/build.gradle +5 -0
  285. data/embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java +284 -0
  286. data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +379 -0
  287. data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +411 -0
  288. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java +55 -0
  289. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java +71 -0
  290. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +203 -0
  291. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java +148 -0
  292. data/embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java +59 -0
  293. data/embulk-standards/src/main/java/org/embulk/standards/RenameFilterPlugin.java +56 -0
  294. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginExtension.java +16 -0
  295. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +53 -0
  296. data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +85 -0
  297. data/embulk-standards/src/main/resources/META-INF/services/org.embulk.spi.Extension +1 -0
  298. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvFormatterPlugin.java +312 -0
  299. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +75 -0
  300. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +360 -0
  301. data/embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java +88 -0
  302. data/embulk.gemspec +39 -0
  303. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  304. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  305. data/gradlew +164 -0
  306. data/gradlew.bat +90 -0
  307. data/lib/embulk.rb +72 -0
  308. data/lib/embulk/buffer.rb +22 -0
  309. data/lib/embulk/column.rb +70 -0
  310. data/lib/embulk/command/embulk_bundle.rb +56 -0
  311. data/lib/embulk/command/embulk_example.rb +32 -0
  312. data/lib/embulk/command/embulk_generate_bin.rb +62 -0
  313. data/lib/embulk/command/embulk_main.rb +2 -0
  314. data/lib/embulk/command/embulk_migrate_plugin.rb +170 -0
  315. data/lib/embulk/command/embulk_new_plugin.rb +124 -0
  316. data/lib/embulk/command/embulk_run.rb +470 -0
  317. data/lib/embulk/command/embulk_selfupdate.rb +84 -0
  318. data/lib/embulk/data/bundle/.bundle/config +3 -0
  319. data/lib/embulk/data/bundle/.ruby-version +1 -0
  320. data/lib/embulk/data/bundle/Gemfile +26 -0
  321. data/lib/embulk/data/bundle/embulk/filter/example.rb +42 -0
  322. data/lib/embulk/data/bundle/embulk/input/example.rb +54 -0
  323. data/lib/embulk/data/bundle/embulk/output/example.rb +58 -0
  324. data/lib/embulk/data/new/LICENSE.txt +21 -0
  325. data/lib/embulk/data/new/README.md.erb +111 -0
  326. data/lib/embulk/data/new/gitignore.erb +13 -0
  327. data/lib/embulk/data/new/java/build.gradle.erb +73 -0
  328. data/lib/embulk/data/new/java/decoder.java.erb +84 -0
  329. data/lib/embulk/data/new/java/encoder.java.erb +86 -0
  330. data/lib/embulk/data/new/java/file_input.java.erb +143 -0
  331. data/lib/embulk/data/new/java/file_output.java.erb +93 -0
  332. data/lib/embulk/data/new/java/filter.java.erb +56 -0
  333. data/lib/embulk/data/new/java/formatter.java.erb +54 -0
  334. data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.jar +0 -0
  335. data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.properties +6 -0
  336. data/lib/embulk/data/new/java/gradlew +164 -0
  337. data/lib/embulk/data/new/java/gradlew.bat +90 -0
  338. data/lib/embulk/data/new/java/input.java.erb +87 -0
  339. data/lib/embulk/data/new/java/output.java.erb +77 -0
  340. data/lib/embulk/data/new/java/parser.java.erb +60 -0
  341. data/lib/embulk/data/new/java/plugin_loader.rb.erb +3 -0
  342. data/lib/embulk/data/new/java/test.java.erb +5 -0
  343. data/lib/embulk/data/new/ruby/.ruby-version +1 -0
  344. data/lib/embulk/data/new/ruby/Gemfile +2 -0
  345. data/lib/embulk/data/new/ruby/Rakefile +3 -0
  346. data/lib/embulk/data/new/ruby/decoder_guess.rb.erb +25 -0
  347. data/lib/embulk/data/new/ruby/filter.rb.erb +41 -0
  348. data/lib/embulk/data/new/ruby/formatter.rb.erb +49 -0
  349. data/lib/embulk/data/new/ruby/gemspec.erb +20 -0
  350. data/lib/embulk/data/new/ruby/input.rb.erb +59 -0
  351. data/lib/embulk/data/new/ruby/output.rb.erb +61 -0
  352. data/lib/embulk/data/new/ruby/parser.rb.erb +44 -0
  353. data/lib/embulk/data/new/ruby/parser_guess.rb.erb +65 -0
  354. data/lib/embulk/data/package_data.rb +50 -0
  355. data/lib/embulk/data_source.rb +220 -0
  356. data/lib/embulk/decoder_plugin.rb +27 -0
  357. data/lib/embulk/encoder_plugin.rb +27 -0
  358. data/lib/embulk/error.rb +8 -0
  359. data/lib/embulk/executor_plugin.rb +23 -0
  360. data/lib/embulk/file_input.rb +87 -0
  361. data/lib/embulk/file_input_plugin.rb +27 -0
  362. data/lib/embulk/file_output.rb +56 -0
  363. data/lib/embulk/file_output_plugin.rb +27 -0
  364. data/lib/embulk/filter_plugin.rb +105 -0
  365. data/lib/embulk/formatter_plugin.rb +105 -0
  366. data/lib/embulk/guess/charset.rb +44 -0
  367. data/lib/embulk/guess/csv.rb +327 -0
  368. data/lib/embulk/guess/gzip.rb +18 -0
  369. data/lib/embulk/guess/newline.rb +22 -0
  370. data/lib/embulk/guess/schema_guess.rb +118 -0
  371. data/lib/embulk/guess/time_format_guess.rb +394 -0
  372. data/lib/embulk/guess_plugin.rb +129 -0
  373. data/lib/embulk/input_plugin.rb +121 -0
  374. data/lib/embulk/java/bootstrap.rb +24 -0
  375. data/lib/embulk/java/imports.rb +69 -0
  376. data/lib/embulk/java/time_helper.rb +79 -0
  377. data/lib/embulk/java_plugin.rb +90 -0
  378. data/lib/embulk/logger.rb +154 -0
  379. data/lib/embulk/output_plugin.rb +150 -0
  380. data/lib/embulk/page.rb +30 -0
  381. data/lib/embulk/page_builder.rb +76 -0
  382. data/lib/embulk/parser_plugin.rb +78 -0
  383. data/lib/embulk/plugin.rb +239 -0
  384. data/lib/embulk/plugin_registry.rb +96 -0
  385. data/lib/embulk/runner.rb +184 -0
  386. data/lib/embulk/schema.rb +103 -0
  387. data/lib/embulk/version.rb +3 -0
  388. data/settings.gradle +6 -0
  389. data/test/guess/test_schema_guess.rb +11 -0
  390. data/test/guess/test_time_format_guess.rb +133 -0
  391. data/test/helper.rb +21 -0
  392. data/test/run-test.rb +14 -0
  393. metadata +566 -0
@@ -0,0 +1,379 @@
1
+ package org.embulk.standards;
2
+
3
+ import com.google.common.base.Optional;
4
+ import com.google.common.collect.ImmutableSet;
5
+ import com.fasterxml.jackson.annotation.JsonCreator;
6
+ import com.fasterxml.jackson.annotation.JsonIgnore;
7
+ import com.fasterxml.jackson.annotation.JsonValue;
8
+ import org.embulk.config.Task;
9
+ import org.embulk.config.Config;
10
+ import org.embulk.config.ConfigDefault;
11
+ import org.embulk.config.ConfigSource;
12
+ import org.embulk.config.ConfigException;
13
+ import org.embulk.config.TaskSource;
14
+ import org.embulk.spi.time.TimestampParser;
15
+ import org.embulk.spi.time.TimestampParseException;
16
+ import org.embulk.spi.Column;
17
+ import org.embulk.spi.Schema;
18
+ import org.embulk.spi.SchemaConfig;
19
+ import org.embulk.spi.ColumnVisitor;
20
+ import org.embulk.spi.PageBuilder;
21
+ import org.embulk.spi.ParserPlugin;
22
+ import org.embulk.spi.Exec;
23
+ import org.embulk.spi.FileInput;
24
+ import org.embulk.spi.PageOutput;
25
+ import org.embulk.spi.util.LineDecoder;
26
+ import org.embulk.spi.util.Timestamps;
27
+ import org.slf4j.Logger;
28
+
29
+ public class CsvParserPlugin
30
+ implements ParserPlugin
31
+ {
32
+ private static final ImmutableSet<String> TRUE_STRINGS =
33
+ ImmutableSet.of(
34
+ "true", "True", "TRUE",
35
+ "yes", "Yes", "YES",
36
+ "t", "T", "y", "Y",
37
+ "on", "On", "ON",
38
+ "1");
39
+
40
+ public interface PluginTask
41
+ extends Task, LineDecoder.DecoderTask, TimestampParser.Task
42
+ {
43
+ @Config("columns")
44
+ SchemaConfig getSchemaConfig();
45
+
46
+ @Config("header_line")
47
+ @ConfigDefault("null")
48
+ Optional<Boolean> getHeaderLine();
49
+
50
+ @Config("skip_header_lines")
51
+ @ConfigDefault("0")
52
+ int getSkipHeaderLines();
53
+ void setSkipHeaderLines(int n);
54
+
55
+ @Config("delimiter")
56
+ @ConfigDefault("\",\"")
57
+ char getDelimiterChar();
58
+
59
+ @Config("quote")
60
+ @ConfigDefault("\"\\\"\"")
61
+ Optional<QuoteCharacter> getQuoteChar();
62
+
63
+ @Config("escape")
64
+ @ConfigDefault("\"\\\\\"")
65
+ Optional<EscapeCharacter> getEscapeChar();
66
+
67
+ // Null value handling: if the CsvParser found 'non-quoted empty string's,
68
+ // it replaces them to string that users specified like "\N", "NULL".
69
+ @Config("null_string")
70
+ @ConfigDefault("null")
71
+ Optional<String> getNullString();
72
+
73
+ @Config("trim_if_not_quoted")
74
+ @ConfigDefault("false")
75
+ boolean getTrimIfNotQuoted();
76
+
77
+ @Config("max_quoted_size_limit")
78
+ @ConfigDefault("131072") //128kB
79
+ long getMaxQuotedSizeLimit();
80
+
81
+ @Config("comment_line_marker")
82
+ @ConfigDefault("null")
83
+ Optional<String> getCommentLineMarker();
84
+
85
+ @Config("allow_optional_columns")
86
+ @ConfigDefault("false")
87
+ boolean getAllowOptionalColumns();
88
+
89
+ @Config("allow_extra_columns")
90
+ @ConfigDefault("false")
91
+ boolean getAllowExtraColumns();
92
+ }
93
+
94
+ public static class QuoteCharacter
95
+ {
96
+ private final char character;
97
+
98
+ public QuoteCharacter(char character)
99
+ {
100
+ this.character = character;
101
+ }
102
+
103
+ public static QuoteCharacter noQuote()
104
+ {
105
+ return new QuoteCharacter(CsvTokenizer.NO_QUOTE);
106
+ }
107
+
108
+ @JsonCreator
109
+ public static QuoteCharacter ofString(String str)
110
+ {
111
+ if (str.length() >= 2) {
112
+ throw new ConfigException("\"quote\" option accepts only 1 character.");
113
+ } else if (str.isEmpty()) {
114
+ Exec.getLogger(CsvParserPlugin.class).warn("Setting '' (empty string) to \"quote\" option is obsoleted. Currently it becomes '\"' automatically but this behavior will be removed. Please set '\"' explicitly.");
115
+ return new QuoteCharacter('"');
116
+ } else {
117
+ return new QuoteCharacter(str.charAt(0));
118
+ }
119
+ }
120
+
121
+ @JsonIgnore
122
+ public char getCharacter()
123
+ {
124
+ return character;
125
+ }
126
+
127
+ @JsonValue
128
+ public String getOptionalString()
129
+ {
130
+ return new String(new char[] { character });
131
+ }
132
+
133
+ @Override
134
+ public boolean equals(Object obj)
135
+ {
136
+ if (!(obj instanceof QuoteCharacter)) {
137
+ return false;
138
+ }
139
+ QuoteCharacter o = (QuoteCharacter) obj;
140
+ return character == o.character;
141
+ }
142
+ }
143
+
144
+ public static class EscapeCharacter
145
+ {
146
+ private final char character;
147
+
148
+ public EscapeCharacter(char character)
149
+ {
150
+ this.character = character;
151
+ }
152
+
153
+ public static EscapeCharacter noEscape()
154
+ {
155
+ return new EscapeCharacter(CsvTokenizer.NO_ESCAPE);
156
+ }
157
+
158
+ @JsonCreator
159
+ public static EscapeCharacter ofString(String str)
160
+ {
161
+ if (str.length() >= 2) {
162
+ throw new ConfigException("\"escape\" option accepts only 1 character.");
163
+ } else if (str.isEmpty()) {
164
+ Exec.getLogger(CsvParserPlugin.class).warn("Setting '' (empty string) to \"escape\" option is obsoleted. Currently it becomes null automatically but this behavior will be removed. Please set \"escape: null\" explicitly.");
165
+ return noEscape();
166
+ } else {
167
+ return new EscapeCharacter(str.charAt(0));
168
+ }
169
+ }
170
+
171
+ @JsonIgnore
172
+ public char getCharacter()
173
+ {
174
+ return character;
175
+ }
176
+
177
+ @JsonValue
178
+ public String getOptionalString()
179
+ {
180
+ return new String(new char[] { character });
181
+ }
182
+
183
+ @Override
184
+ public boolean equals(Object obj)
185
+ {
186
+ if (!(obj instanceof EscapeCharacter)) {
187
+ return false;
188
+ }
189
+ EscapeCharacter o = (EscapeCharacter) obj;
190
+ return character == o.character;
191
+ }
192
+ }
193
+
194
+ private final Logger log;
195
+
196
/** Creates the plugin and obtains its logger from {@code Exec}. */
public CsvParserPlugin()
{
    this.log = Exec.getLogger(CsvParserPlugin.class);
}
200
+
201
+ @Override
202
+ public void transaction(ConfigSource config, ParserPlugin.Control control)
203
+ {
204
+ PluginTask task = config.loadConfig(PluginTask.class);
205
+
206
+ // backward compatibility
207
+ if (task.getHeaderLine().isPresent()) {
208
+ if (task.getSkipHeaderLines() > 0) {
209
+ throw new ConfigException("'header_line' option is invalid if 'skip_header_lines' is set.");
210
+ }
211
+ if (task.getHeaderLine().get()) {
212
+ task.setSkipHeaderLines(1);
213
+ } else {
214
+ task.setSkipHeaderLines(0);
215
+ }
216
+ }
217
+
218
+ control.run(task.dump(), task.getSchemaConfig().toSchema());
219
+ }
220
+
221
+ @Override
222
+ public void run(TaskSource taskSource, final Schema schema,
223
+ FileInput input, PageOutput output)
224
+ {
225
+ PluginTask task = taskSource.loadTask(PluginTask.class);
226
+ final TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getSchemaConfig());
227
+ LineDecoder lineDecoder = new LineDecoder(input, task);
228
+ final CsvTokenizer tokenizer = new CsvTokenizer(lineDecoder, task);
229
+ final String nullStringOrNull = task.getNullString().orNull();
230
+ final boolean allowOptionalColumns = task.getAllowOptionalColumns();
231
+ final boolean allowExtraColumns = task.getAllowExtraColumns();
232
+ int skipHeaderLines = task.getSkipHeaderLines();
233
+
234
+ try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
235
+ while (tokenizer.nextFile()) {
236
+ // skip the header lines for each file
237
+ for (; skipHeaderLines > 0; skipHeaderLines--) {
238
+ if (lineDecoder.poll() == null) {
239
+ break;
240
+ }
241
+ }
242
+
243
+ if (!tokenizer.nextRecord()) {
244
+ // empty file
245
+ continue;
246
+ }
247
+
248
+ while (true) {
249
+ boolean hasNextRecord;
250
+
251
+ try {
252
+ schema.visitColumns(new ColumnVisitor() {
253
+ public void booleanColumn(Column column)
254
+ {
255
+ String v = nextColumn();
256
+ if (v == null) {
257
+ pageBuilder.setNull(column);
258
+ } else {
259
+ pageBuilder.setBoolean(column, TRUE_STRINGS.contains(v));
260
+ }
261
+ }
262
+
263
+ public void longColumn(Column column)
264
+ {
265
+ String v = nextColumn();
266
+ if (v == null) {
267
+ pageBuilder.setNull(column);
268
+ } else {
269
+ try {
270
+ pageBuilder.setLong(column, Long.parseLong(v));
271
+ } catch (NumberFormatException e) {
272
+ // TODO support default value
273
+ throw new CsvRecordValidateException(e);
274
+ }
275
+ }
276
+ }
277
+
278
+ public void doubleColumn(Column column)
279
+ {
280
+ String v = nextColumn();
281
+ if (v == null) {
282
+ pageBuilder.setNull(column);
283
+ } else {
284
+ try {
285
+ pageBuilder.setDouble(column, Double.parseDouble(v));
286
+ } catch (NumberFormatException e) {
287
+ // TODO support default value
288
+ throw new CsvRecordValidateException(e);
289
+ }
290
+ }
291
+ }
292
+
293
+ public void stringColumn(Column column)
294
+ {
295
+ String v = nextColumn();
296
+ if (v == null) {
297
+ pageBuilder.setNull(column);
298
+ } else {
299
+ pageBuilder.setString(column, v);
300
+ }
301
+ }
302
+
303
+ public void timestampColumn(Column column)
304
+ {
305
+ String v = nextColumn();
306
+ if (v == null) {
307
+ pageBuilder.setNull(column);
308
+ } else {
309
+ try {
310
+ pageBuilder.setTimestamp(column, timestampParsers[column.getIndex()].parse(v));
311
+ } catch (TimestampParseException e) {
312
+ // TODO support default value
313
+ throw new CsvRecordValidateException(e);
314
+ }
315
+ }
316
+ }
317
+
318
+ private String nextColumn()
319
+ {
320
+ if (allowOptionalColumns && !tokenizer.hasNextColumn()) {
321
+ //TODO warning
322
+ return null;
323
+ }
324
+ String v = tokenizer.nextColumn();
325
+ if (!v.isEmpty()) {
326
+ if (v.equals(nullStringOrNull)) {
327
+ return null;
328
+ }
329
+ return v;
330
+ } else if (tokenizer.wasQuotedColumn()) {
331
+ return "";
332
+ } else {
333
+ return null;
334
+ }
335
+ }
336
+ });
337
+
338
+ try {
339
+ hasNextRecord = tokenizer.nextRecord();
340
+ } catch (CsvTokenizer.TooManyColumnsException ex) {
341
+ if (allowExtraColumns) {
342
+ String tooManyColumnsLine = tokenizer.skipCurrentLine();
343
+ // TODO warning
344
+ hasNextRecord = tokenizer.nextRecord();
345
+ } else {
346
+ // this line will be skipped at the following catch section
347
+ throw ex;
348
+ }
349
+ }
350
+ pageBuilder.addRecord();
351
+
352
+ } catch (CsvTokenizer.InvalidFormatException | CsvTokenizer.InvalidValueException | CsvRecordValidateException e) {
353
+ long lineNumber = tokenizer.getCurrentLineNumber();
354
+ String skippedLine = tokenizer.skipCurrentLine();
355
+ log.warn(String.format("Skipped line %d (%s): %s", lineNumber, e.getMessage(), skippedLine));
356
+ //exec.notice().skippedLine(skippedLine);
357
+
358
+ hasNextRecord = tokenizer.nextRecord();
359
+ }
360
+
361
+ if (!hasNextRecord) {
362
+ break;
363
+ }
364
+ }
365
+ }
366
+
367
+ pageBuilder.finish();
368
+ }
369
+ }
370
+
371
+ static class CsvRecordValidateException
372
+ extends RuntimeException
373
+ {
374
+ CsvRecordValidateException(Throwable cause)
375
+ {
376
+ super(cause);
377
+ }
378
+ }
379
+ }
@@ -0,0 +1,411 @@
1
+ package org.embulk.standards;
2
+
3
+ import com.google.common.base.Preconditions;
4
+ import java.util.List;
5
+ import java.util.ArrayList;
6
+ import java.util.Deque;
7
+ import java.util.ArrayDeque;
8
+ import org.embulk.spi.util.LineDecoder;
9
+
10
+ public class CsvTokenizer
11
+ {
12
/** Whether the tokenizer is inside a record (NOT_END) or has consumed it (END). */
enum RecordState
{
    NOT_END,
    END,
}
16
+
17
/** States of the per-column state machine driven by nextColumn(). */
enum ColumnState
{
    BEGIN,
    VALUE,
    QUOTED_VALUE,
    AFTER_QUOTED_VALUE,
    FIRST_TRIM,
    LAST_TRIM_OR_VALUE,
}
21
+
22
+ private static final char END_OF_LINE = '\0';
23
+ static final char NO_QUOTE = '\0';
24
+ static final char NO_ESCAPE = '\0';
25
+
26
+ private final char delimiter;
27
+ private final char quote;
28
+ private final char escape;
29
+ private final String newline;
30
+ private final boolean trimIfNotQuoted;
31
+ private final long maxQuotedSizeLimit;
32
+ private final String commentLineMarker;
33
+ private final LineDecoder input;
34
+
35
+ private RecordState recordState = RecordState.END; // initial state is end of a record. nextRecord() must be called first
36
+ private long lineNumber = 0;
37
+
38
+ private String line = null;
39
+ private int linePos = 0;
40
+ private boolean wasQuotedColumn = false;
41
+ private List<String> quotedValueLines = new ArrayList<>();
42
+ private Deque<String> unreadLines = new ArrayDeque<>();
43
+
44
/**
 * Creates a tokenizer that reads lines from {@code input} and splits them
 * according to the options of {@code task}.
 *
 * An unset quote or escape option falls back to the NO_QUOTE / NO_ESCAPE
 * marker character; an unset comment line marker becomes null.
 */
public CsvTokenizer(LineDecoder input, CsvParserPlugin.PluginTask task)
{
    this.input = input;
    this.delimiter = task.getDelimiterChar();
    this.quote = task.getQuoteChar().or(CsvParserPlugin.QuoteCharacter.noQuote()).getCharacter();
    this.escape = task.getEscapeChar().or(CsvParserPlugin.EscapeCharacter.noEscape()).getCharacter();
    this.newline = task.getNewline().getString();
    this.trimIfNotQuoted = task.getTrimIfNotQuoted();
    this.maxQuotedSizeLimit = task.getMaxQuotedSizeLimit();
    this.commentLineMarker = task.getCommentLineMarker().orNull();
}
55
+
56
+ public long getCurrentLineNumber()
57
+ {
58
+ // returns actual line number. Internally, lineNumber starts at 0.
59
+ return lineNumber + 1;
60
+ }
61
+
62
+ // returns skipped line
63
+ public String skipCurrentLine()
64
+ {
65
+ String skippedLine;
66
+ if (quotedValueLines.isEmpty()) {
67
+ skippedLine = line;
68
+ } else {
69
+ // recover lines of quoted value
70
+ skippedLine = quotedValueLines.remove(0); // TODO optimize performance
71
+ unreadLines.addAll(quotedValueLines);
72
+ unreadLines.add(line);
73
+ lineNumber -= quotedValueLines.size();
74
+ quotedValueLines.clear();
75
+ }
76
+ recordState = RecordState.END;
77
+ return skippedLine;
78
+ }
79
+
80
+ public boolean nextFile()
81
+ {
82
+ return input.nextFile();
83
+ }
84
+
85
+ // used by guess-csv
86
+ public boolean nextRecord()
87
+ {
88
+ return nextRecord(true);
89
+ }
90
+
91
+ public boolean nextRecord(boolean skipEmptyLine)
92
+ {
93
+ // If at the end of record, read the next line and initialize the state
94
+ if (recordState != RecordState.END) {
95
+ throw new TooManyColumnsException("Too many columns");
96
+ }
97
+
98
+ boolean hasNext = nextLine(skipEmptyLine);
99
+ if (hasNext) {
100
+ recordState = RecordState.NOT_END;
101
+ return true;
102
+ } else {
103
+ return false;
104
+ }
105
+ }
106
+
107
+ private boolean nextLine(boolean skipEmptyLine)
108
+ {
109
+ while (true) {
110
+ if (!unreadLines.isEmpty()) {
111
+ line = unreadLines.removeFirst();
112
+ } else {
113
+ line = input.poll();
114
+ if (line == null) {
115
+ return false;
116
+ }
117
+ }
118
+ linePos = 0;
119
+ lineNumber++;
120
+
121
+ boolean skip = skipEmptyLine && (
122
+ line.isEmpty() ||
123
+ (commentLineMarker != null && line.startsWith(commentLineMarker)));
124
+ if (!skip) {
125
+ return true;
126
+ }
127
+ }
128
+ }
129
+
130
+ public boolean hasNextColumn()
131
+ {
132
+ return recordState == RecordState.NOT_END;
133
+ }
134
+
135
+ public String nextColumn()
136
+ {
137
+ if (!hasNextColumn()) {
138
+ throw new TooFewColumnsException("Too few columns");
139
+ }
140
+
141
+ // reset last state
142
+ wasQuotedColumn = false;
143
+ quotedValueLines.clear();
144
+
145
+ // local state
146
+ int valueStartPos = linePos;
147
+ int valueEndPos = 0; // initialized by VALUE state and used by LAST_TRIM_OR_VALUE and
148
+ StringBuilder quotedValue = null; // initial by VALUE or FIRST_TRIM state and used by QUOTED_VALUE state
149
+ ColumnState columnState = ColumnState.BEGIN;
150
+
151
+ while (true) {
152
+ final char c = nextChar();
153
+
154
+ switch (columnState) {
155
+ case BEGIN:
156
+ // TODO optimization: state is BEGIN only at the first character of a column.
157
+ // this block can be out of the looop.
158
+ if (isDelimiter(c)) {
159
+ // empty value
160
+ return "";
161
+
162
+ } else if (isEndOfLine(c)) {
163
+ // empty value
164
+ recordState = RecordState.END;
165
+ return "";
166
+
167
+ } else if (isSpace(c) && trimIfNotQuoted) {
168
+ columnState = ColumnState.FIRST_TRIM;
169
+
170
+ } else if (isQuote(c)) {
171
+ valueStartPos = linePos; // == 1
172
+ wasQuotedColumn = true;
173
+ quotedValue = new StringBuilder();
174
+ columnState = ColumnState.QUOTED_VALUE;
175
+
176
+ } else {
177
+ columnState = ColumnState.VALUE;
178
+ }
179
+ break;
180
+
181
+ case FIRST_TRIM:
182
+ if (isDelimiter(c)) {
183
+ // empty value
184
+ return "";
185
+
186
+ } else if (isEndOfLine(c)) {
187
+ // empty value
188
+ recordState = RecordState.END;
189
+ return "";
190
+
191
+ } else if (isQuote(c)) {
192
+ // column has heading spaces and quoted. TODO should this be rejected?
193
+ valueStartPos = linePos;
194
+ wasQuotedColumn = true;
195
+ quotedValue = new StringBuilder();
196
+ columnState = ColumnState.QUOTED_VALUE;
197
+
198
+ } else if (isSpace(c)) {
199
+ // skip this character
200
+
201
+ } else {
202
+ valueStartPos = linePos - 1;
203
+ columnState = ColumnState.VALUE;
204
+ }
205
+ break;
206
+
207
+ case VALUE:
208
+ if (isDelimiter(c)) {
209
+ return line.substring(valueStartPos, linePos - 1);
210
+
211
+ } else if (isEndOfLine(c)) {
212
+ recordState = RecordState.END;
213
+ return line.substring(valueStartPos, linePos);
214
+
215
+ } else if (isSpace(c) && trimIfNotQuoted) {
216
+ valueEndPos = linePos - 1; // this is possibly end of value
217
+ columnState = ColumnState.LAST_TRIM_OR_VALUE;
218
+
219
+ // TODO not implemented yet foo""bar""baz -> [foo, bar, baz].append
220
+ //} else if (isQuote(c)) {
221
+ // // In RFC4180, If fields are not enclosed with double quotes, then
222
+ // // double quotes may not appear inside the fields. But they are often
223
+ // // included in the fields. We should care about them later.
224
+
225
+ } else {
226
+ // keep VALUE state
227
+ }
228
+ break;
229
+
230
+ case LAST_TRIM_OR_VALUE:
231
+ if (isDelimiter(c)) {
232
+ return line.substring(valueStartPos, valueEndPos);
233
+
234
+ } else if (isEndOfLine(c)) {
235
+ recordState = RecordState.END;
236
+ return line.substring(valueStartPos, valueEndPos);
237
+
238
+ } else if (isSpace(c)) {
239
+ // keep LAST_TRIM_OR_VALUE state
240
+
241
+ } else {
242
+ // this spaces are not trailing spaces. go back to VALUE state
243
+ columnState = ColumnState.BEGIN;
244
+ }
245
+ break;
246
+
247
+ case QUOTED_VALUE:
248
+ if (isEndOfLine(c)) {
249
+ // multi-line quoted value
250
+ quotedValue.append(line.substring(valueStartPos, linePos));
251
+ quotedValue.append(newline);
252
+ quotedValueLines.add(line);
253
+ if (!nextLine(false)) {
254
+ throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
255
+ }
256
+ valueStartPos = 0;
257
+
258
+ } else if (isQuote(c)) {
259
+ char next = peekNextChar();
260
+ if (isQuote(next)) { // escaped quote
261
+ quotedValue.append(line.substring(valueStartPos, linePos));
262
+ valueStartPos = ++linePos;
263
+ } else {
264
+ quotedValue.append(line.substring(valueStartPos, linePos - 1));
265
+ columnState = ColumnState.AFTER_QUOTED_VALUE;
266
+ }
267
+
268
+ } else if (isEscape(c)) { // isQuote must be checked first in case of quote == escape
269
+ // In RFC 4180, CSV's escape char is '\"'. But '\\' is often used.
270
+ char next = peekNextChar();
271
+ if (isEndOfLine(c)) {
272
+ // escape end of line. TODO assuming multi-line quoted value without newline?
273
+ quotedValue.append(line.substring(valueStartPos, linePos));
274
+ quotedValueLines.add(line);
275
+ if (!nextLine(false)) {
276
+ throw new InvalidValueException("Unexpected end of line during parsing a quoted value");
277
+ }
278
+ valueStartPos = 0;
279
+ } else if (isQuote(next) || isEscape(next)) { // escaped quote
280
+ quotedValue.append(line.substring(valueStartPos, linePos - 1));
281
+ quotedValue.append(next);
282
+ valueStartPos = ++linePos;
283
+ }
284
+
285
+ } else {
286
+ if ((linePos - valueStartPos) + quotedValue.length() > maxQuotedSizeLimit) {
287
+ throw new QuotedSizeLimitExceededException("The size of the quoted value exceeds the limit size ("+maxQuotedSizeLimit+")");
288
+ }
289
+ // keep QUOTED_VALUE state
290
+ }
291
+ break;
292
+
293
+ case AFTER_QUOTED_VALUE:
294
+ if (isDelimiter(c)) {
295
+ return quotedValue.toString();
296
+
297
+ } else if (isEndOfLine(c)) {
298
+ recordState = RecordState.END;
299
+ return quotedValue.toString();
300
+
301
+ } else if (isSpace(c)) {
302
+ // column has trailing spaces and quoted. TODO should this be rejected?
303
+
304
+ } else {
305
+ throw new InvalidValueException(String.format("Unexpected extra character (%c) after quoted value in %s", c, line));
306
+ }
307
+ break;
308
+
309
+ default:
310
+ assert false;
311
+ }
312
+ }
313
+ }
314
+
315
+ public boolean wasQuotedColumn()
316
+ {
317
+ return wasQuotedColumn;
318
+ }
319
+
320
+ private char nextChar()
321
+ {
322
+ Preconditions.checkState(line != null, "nextColumn is called after end of file");
323
+
324
+ if (linePos >= line.length()) {
325
+ return END_OF_LINE;
326
+ } else {
327
+ return line.charAt(linePos++);
328
+ }
329
+ }
330
+
331
+ private char peekNextChar()
332
+ {
333
+ Preconditions.checkState(line != null, "peekNextChar is called after end of file");
334
+
335
+ if (linePos >= line.length()) {
336
+ return END_OF_LINE;
337
+ } else {
338
+ return line.charAt(linePos);
339
+ }
340
+ }
341
+
342
+ private boolean isSpace(char c)
343
+ {
344
+ return c == ' ';
345
+ }
346
+
347
+ private boolean isDelimiter(char c)
348
+ {
349
+ return c == delimiter;
350
+ }
351
+
352
+ private boolean isEndOfLine(char c)
353
+ {
354
+ return c == END_OF_LINE;
355
+ }
356
+
357
+ private boolean isQuote(char c)
358
+ {
359
+ return quote != NO_QUOTE && c == quote;
360
+ }
361
+
362
+ private boolean isEscape(char c)
363
+ {
364
+ return escape != NO_ESCAPE && c == escape;
365
+ }
366
+
367
+ public static class InvalidFormatException
368
+ extends RuntimeException
369
+ {
370
+ public InvalidFormatException(String message)
371
+ {
372
+ super(message);
373
+ }
374
+ }
375
+
376
+ public static class InvalidValueException
377
+ extends RuntimeException
378
+ {
379
+ public InvalidValueException(String message)
380
+ {
381
+ super(message);
382
+ }
383
+ }
384
+
385
+ public static class QuotedSizeLimitExceededException
386
+ extends InvalidValueException
387
+ {
388
+ public QuotedSizeLimitExceededException(String message)
389
+ {
390
+ super(message);
391
+ }
392
+ }
393
+
394
+ public class TooManyColumnsException
395
+ extends InvalidFormatException
396
+ {
397
+ public TooManyColumnsException(String message)
398
+ {
399
+ super(message);
400
+ }
401
+ }
402
+
403
+ public class TooFewColumnsException
404
+ extends InvalidFormatException
405
+ {
406
+ public TooFewColumnsException(String message)
407
+ {
408
+ super(message);
409
+ }
410
+ }
411
+ }