embulk 0.7.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (393) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.ruby-version +1 -0
  4. data/.travis.yml +18 -0
  5. data/COPYING +14 -0
  6. data/Gemfile +2 -0
  7. data/Gemfile.lock +31 -0
  8. data/README.md +206 -0
  9. data/Rakefile +26 -0
  10. data/appveyor.yml +20 -0
  11. data/bin/embulk +106 -0
  12. data/build.gradle +338 -0
  13. data/embulk-cli/build.gradle +6 -0
  14. data/embulk-cli/src/main/java/org/embulk/cli/Main.java +22 -0
  15. data/embulk-cli/src/main/sh/selfrun.sh +158 -0
  16. data/embulk-cli/src/test/java/org/embulk/cli/DummyMain.java +23 -0
  17. data/embulk-cli/src/test/java/org/embulk/cli/SelfrunTest.java +281 -0
  18. data/embulk-core/build.gradle +59 -0
  19. data/embulk-core/src/main/java/org/embulk/EmbulkEmbed.java +315 -0
  20. data/embulk-core/src/main/java/org/embulk/EmbulkService.java +76 -0
  21. data/embulk-core/src/main/java/org/embulk/command/PreviewPrinter.java +84 -0
  22. data/embulk-core/src/main/java/org/embulk/command/TablePreviewPrinter.java +107 -0
  23. data/embulk-core/src/main/java/org/embulk/command/VerticalPreviewPrinter.java +47 -0
  24. data/embulk-core/src/main/java/org/embulk/config/CommitReport.java +33 -0
  25. data/embulk-core/src/main/java/org/embulk/config/Config.java +15 -0
  26. data/embulk-core/src/main/java/org/embulk/config/ConfigDefault.java +15 -0
  27. data/embulk-core/src/main/java/org/embulk/config/ConfigDiff.java +29 -0
  28. data/embulk-core/src/main/java/org/embulk/config/ConfigException.java +20 -0
  29. data/embulk-core/src/main/java/org/embulk/config/ConfigInject.java +14 -0
  30. data/embulk-core/src/main/java/org/embulk/config/ConfigLoader.java +141 -0
  31. data/embulk-core/src/main/java/org/embulk/config/ConfigSource.java +31 -0
  32. data/embulk-core/src/main/java/org/embulk/config/DataSource.java +39 -0
  33. data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +231 -0
  34. data/embulk-core/src/main/java/org/embulk/config/DataSourceSerDe.java +84 -0
  35. data/embulk-core/src/main/java/org/embulk/config/GenericTypeReference.java +20 -0
  36. data/embulk-core/src/main/java/org/embulk/config/ModelManager.java +123 -0
  37. data/embulk-core/src/main/java/org/embulk/config/Task.java +10 -0
  38. data/embulk-core/src/main/java/org/embulk/config/TaskInvocationHandler.java +180 -0
  39. data/embulk-core/src/main/java/org/embulk/config/TaskReport.java +29 -0
  40. data/embulk-core/src/main/java/org/embulk/config/TaskSerDe.java +345 -0
  41. data/embulk-core/src/main/java/org/embulk/config/TaskSource.java +31 -0
  42. data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +38 -0
  43. data/embulk-core/src/main/java/org/embulk/config/TaskValidator.java +24 -0
  44. data/embulk-core/src/main/java/org/embulk/exec/BulkLoader.java +652 -0
  45. data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +52 -0
  46. data/embulk-core/src/main/java/org/embulk/exec/ExecutionInterruptedException.java +10 -0
  47. data/embulk-core/src/main/java/org/embulk/exec/ExecutionResult.java +26 -0
  48. data/embulk-core/src/main/java/org/embulk/exec/ExtensionServiceLoaderModule.java +43 -0
  49. data/embulk-core/src/main/java/org/embulk/exec/ForGuess.java +16 -0
  50. data/embulk-core/src/main/java/org/embulk/exec/ForSystemConfig.java +16 -0
  51. data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +373 -0
  52. data/embulk-core/src/main/java/org/embulk/exec/LocalExecutorPlugin.java +129 -0
  53. data/embulk-core/src/main/java/org/embulk/exec/LocalThreadExecutor.java +34 -0
  54. data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +60 -0
  55. data/embulk-core/src/main/java/org/embulk/exec/NoSampleException.java +10 -0
  56. data/embulk-core/src/main/java/org/embulk/exec/PartialExecutionException.java +18 -0
  57. data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +77 -0
  58. data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +183 -0
  59. data/embulk-core/src/main/java/org/embulk/exec/PreviewResult.java +27 -0
  60. data/embulk-core/src/main/java/org/embulk/exec/PreviewedNoticeError.java +17 -0
  61. data/embulk-core/src/main/java/org/embulk/exec/ResumeState.java +100 -0
  62. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +136 -0
  63. data/embulk-core/src/main/java/org/embulk/exec/SetCurrentThreadName.java +19 -0
  64. data/embulk-core/src/main/java/org/embulk/exec/SystemConfigModule.java +24 -0
  65. data/embulk-core/src/main/java/org/embulk/exec/TempFileAllocator.java +35 -0
  66. data/embulk-core/src/main/java/org/embulk/guice/Bootstrap.java +157 -0
  67. data/embulk-core/src/main/java/org/embulk/guice/CloseableInjector.java +22 -0
  68. data/embulk-core/src/main/java/org/embulk/guice/InjectorProxy.java +145 -0
  69. data/embulk-core/src/main/java/org/embulk/guice/LifeCycleInjector.java +26 -0
  70. data/embulk-core/src/main/java/org/embulk/guice/LifeCycleInjectorProxy.java +61 -0
  71. data/embulk-core/src/main/java/org/embulk/guice/LifeCycleManager.java +187 -0
  72. data/embulk-core/src/main/java/org/embulk/guice/LifeCycleMethods.java +89 -0
  73. data/embulk-core/src/main/java/org/embulk/guice/LifeCycleMethodsMap.java +38 -0
  74. data/embulk-core/src/main/java/org/embulk/guice/LifeCycleModule.java +97 -0
  75. data/embulk-core/src/main/java/org/embulk/jruby/JRubyPluginSource.java +72 -0
  76. data/embulk-core/src/main/java/org/embulk/jruby/JRubyScriptingModule.java +119 -0
  77. data/embulk-core/src/main/java/org/embulk/plugin/BuiltinPluginSourceModule.java +17 -0
  78. data/embulk-core/src/main/java/org/embulk/plugin/InjectedPluginSource.java +96 -0
  79. data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoader.java +168 -0
  80. data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoaderFactory.java +9 -0
  81. data/embulk-core/src/main/java/org/embulk/plugin/PluginClassLoaderModule.java +71 -0
  82. data/embulk-core/src/main/java/org/embulk/plugin/PluginManager.java +78 -0
  83. data/embulk-core/src/main/java/org/embulk/plugin/PluginSource.java +6 -0
  84. data/embulk-core/src/main/java/org/embulk/plugin/PluginSourceNotMatchException.java +19 -0
  85. data/embulk-core/src/main/java/org/embulk/plugin/PluginType.java +47 -0
  86. data/embulk-core/src/main/java/org/embulk/plugin/compat/InputPluginWrapper.java +102 -0
  87. data/embulk-core/src/main/java/org/embulk/plugin/compat/PluginWrappers.java +30 -0
  88. data/embulk-core/src/main/java/org/embulk/plugin/compat/TransactionalFileInputWrapper.java +96 -0
  89. data/embulk-core/src/main/java/org/embulk/plugin/compat/TransactionalFileOutputWrapper.java +102 -0
  90. data/embulk-core/src/main/java/org/embulk/plugin/compat/TransactionalPageOutputWrapper.java +95 -0
  91. data/embulk-core/src/main/java/org/embulk/spi/Buffer.java +148 -0
  92. data/embulk-core/src/main/java/org/embulk/spi/BufferAllocator.java +8 -0
  93. data/embulk-core/src/main/java/org/embulk/spi/Column.java +92 -0
  94. data/embulk-core/src/main/java/org/embulk/spi/ColumnConfig.java +112 -0
  95. data/embulk-core/src/main/java/org/embulk/spi/ColumnVisitor.java +14 -0
  96. data/embulk-core/src/main/java/org/embulk/spi/DecoderPlugin.java +16 -0
  97. data/embulk-core/src/main/java/org/embulk/spi/EncoderPlugin.java +16 -0
  98. data/embulk-core/src/main/java/org/embulk/spi/Exec.java +113 -0
  99. data/embulk-core/src/main/java/org/embulk/spi/ExecAction.java +6 -0
  100. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +217 -0
  101. data/embulk-core/src/main/java/org/embulk/spi/ExecutorPlugin.java +19 -0
  102. data/embulk-core/src/main/java/org/embulk/spi/Extension.java +44 -0
  103. data/embulk-core/src/main/java/org/embulk/spi/FileInput.java +11 -0
  104. data/embulk-core/src/main/java/org/embulk/spi/FileInputPlugin.java +30 -0
  105. data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +162 -0
  106. data/embulk-core/src/main/java/org/embulk/spi/FileOutput.java +13 -0
  107. data/embulk-core/src/main/java/org/embulk/spi/FileOutputPlugin.java +28 -0
  108. data/embulk-core/src/main/java/org/embulk/spi/FileOutputRunner.java +202 -0
  109. data/embulk-core/src/main/java/org/embulk/spi/FilterPlugin.java +18 -0
  110. data/embulk-core/src/main/java/org/embulk/spi/FormatterPlugin.java +18 -0
  111. data/embulk-core/src/main/java/org/embulk/spi/GuessPlugin.java +9 -0
  112. data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +33 -0
  113. data/embulk-core/src/main/java/org/embulk/spi/OutputPlugin.java +29 -0
  114. data/embulk-core/src/main/java/org/embulk/spi/Page.java +51 -0
  115. data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +338 -0
  116. data/embulk-core/src/main/java/org/embulk/spi/PageFormat.java +47 -0
  117. data/embulk-core/src/main/java/org/embulk/spi/PageOutput.java +11 -0
  118. data/embulk-core/src/main/java/org/embulk/spi/PageReader.java +226 -0
  119. data/embulk-core/src/main/java/org/embulk/spi/ParserPlugin.java +17 -0
  120. data/embulk-core/src/main/java/org/embulk/spi/ProcessState.java +10 -0
  121. data/embulk-core/src/main/java/org/embulk/spi/ProcessTask.java +117 -0
  122. data/embulk-core/src/main/java/org/embulk/spi/Schema.java +134 -0
  123. data/embulk-core/src/main/java/org/embulk/spi/SchemaConfig.java +93 -0
  124. data/embulk-core/src/main/java/org/embulk/spi/SchemaConfigException.java +22 -0
  125. data/embulk-core/src/main/java/org/embulk/spi/TaskState.java +81 -0
  126. data/embulk-core/src/main/java/org/embulk/spi/TempFileException.java +19 -0
  127. data/embulk-core/src/main/java/org/embulk/spi/TempFileSpace.java +87 -0
  128. data/embulk-core/src/main/java/org/embulk/spi/Transactional.java +10 -0
  129. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileInput.java +17 -0
  130. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileOutput.java +19 -0
  131. data/embulk-core/src/main/java/org/embulk/spi/TransactionalPageOutput.java +17 -0
  132. data/embulk-core/src/main/java/org/embulk/spi/time/DateTimeZoneSerDe.java +55 -0
  133. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelper.java +8 -0
  134. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelperFactory.java +6 -0
  135. data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +159 -0
  136. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +100 -0
  137. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormatter.java +97 -0
  138. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParseException.java +10 -0
  139. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParser.java +104 -0
  140. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampSerDe.java +49 -0
  141. data/embulk-core/src/main/java/org/embulk/spi/type/AbstractType.java +58 -0
  142. data/embulk-core/src/main/java/org/embulk/spi/type/BooleanType.java +12 -0
  143. data/embulk-core/src/main/java/org/embulk/spi/type/DoubleType.java +12 -0
  144. data/embulk-core/src/main/java/org/embulk/spi/type/LongType.java +12 -0
  145. data/embulk-core/src/main/java/org/embulk/spi/type/StringType.java +12 -0
  146. data/embulk-core/src/main/java/org/embulk/spi/type/TimestampType.java +41 -0
  147. data/embulk-core/src/main/java/org/embulk/spi/type/Type.java +15 -0
  148. data/embulk-core/src/main/java/org/embulk/spi/type/TypeDeserializer.java +44 -0
  149. data/embulk-core/src/main/java/org/embulk/spi/type/Types.java +14 -0
  150. data/embulk-core/src/main/java/org/embulk/spi/unit/ByteSize.java +156 -0
  151. data/embulk-core/src/main/java/org/embulk/spi/unit/LocalFile.java +106 -0
  152. data/embulk-core/src/main/java/org/embulk/spi/unit/LocalFileSerDe.java +113 -0
  153. data/embulk-core/src/main/java/org/embulk/spi/unit/ToString.java +54 -0
  154. data/embulk-core/src/main/java/org/embulk/spi/unit/ToStringMap.java +34 -0
  155. data/embulk-core/src/main/java/org/embulk/spi/util/CharsetSerDe.java +55 -0
  156. data/embulk-core/src/main/java/org/embulk/spi/util/Decoders.java +81 -0
  157. data/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnNotFoundException.java +10 -0
  158. data/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetter.java +18 -0
  159. data/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetterFactory.java +94 -0
  160. data/embulk-core/src/main/java/org/embulk/spi/util/DynamicPageBuilder.java +161 -0
  161. data/embulk-core/src/main/java/org/embulk/spi/util/Encoders.java +81 -0
  162. data/embulk-core/src/main/java/org/embulk/spi/util/Executors.java +95 -0
  163. data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +111 -0
  164. data/embulk-core/src/main/java/org/embulk/spi/util/FileOutputOutputStream.java +119 -0
  165. data/embulk-core/src/main/java/org/embulk/spi/util/Filters.java +100 -0
  166. data/embulk-core/src/main/java/org/embulk/spi/util/InputStreamFileInput.java +190 -0
  167. data/embulk-core/src/main/java/org/embulk/spi/util/InputStreamTransactionalFileInput.java +25 -0
  168. data/embulk-core/src/main/java/org/embulk/spi/util/Inputs.java +65 -0
  169. data/embulk-core/src/main/java/org/embulk/spi/util/LineDecoder.java +118 -0
  170. data/embulk-core/src/main/java/org/embulk/spi/util/LineEncoder.java +123 -0
  171. data/embulk-core/src/main/java/org/embulk/spi/util/ListFileInput.java +52 -0
  172. data/embulk-core/src/main/java/org/embulk/spi/util/Newline.java +38 -0
  173. data/embulk-core/src/main/java/org/embulk/spi/util/OutputStreamFileOutput.java +88 -0
  174. data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +102 -0
  175. data/embulk-core/src/main/java/org/embulk/spi/util/Pages.java +139 -0
  176. data/embulk-core/src/main/java/org/embulk/spi/util/ResumableInputStream.java +128 -0
  177. data/embulk-core/src/main/java/org/embulk/spi/util/RetryExecutor.java +130 -0
  178. data/embulk-core/src/main/java/org/embulk/spi/util/Timestamps.java +53 -0
  179. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/AbstractDynamicColumnSetter.java +79 -0
  180. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/BooleanColumnSetter.java +64 -0
  181. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/DefaultValueSetter.java +18 -0
  182. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/DoubleColumnSetter.java +61 -0
  183. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/LongColumnSetter.java +69 -0
  184. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/NullDefaultValueSetter.java +34 -0
  185. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/SkipColumnSetter.java +52 -0
  186. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/StringColumnSetter.java +56 -0
  187. data/embulk-core/src/main/java/org/embulk/spi/util/dynamic/TimestampColumnSetter.java +64 -0
  188. data/embulk-core/src/main/resources/embulk/logback-color.xml +72 -0
  189. data/embulk-core/src/main/resources/embulk/logback-console.xml +14 -0
  190. data/embulk-core/src/main/resources/embulk/logback-file.xml +25 -0
  191. data/embulk-core/src/main/resources/embulk/parent_first_packages.properties +70 -0
  192. data/embulk-core/src/main/resources/embulk/parent_first_resources.properties +28 -0
  193. data/embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java +114 -0
  194. data/embulk-core/src/test/java/org/embulk/GuiceBinder.java +72 -0
  195. data/embulk-core/src/test/java/org/embulk/RandomManager.java +53 -0
  196. data/embulk-core/src/test/java/org/embulk/TestPluginSourceModule.java +23 -0
  197. data/embulk-core/src/test/java/org/embulk/TestUtilityModule.java +17 -0
  198. data/embulk-core/src/test/java/org/embulk/config/TestConfigLoader.java +66 -0
  199. data/embulk-core/src/test/java/org/embulk/config/TestConfigSource.java +114 -0
  200. data/embulk-core/src/test/java/org/embulk/config/TestTaskSource.java +70 -0
  201. data/embulk-core/src/test/java/org/embulk/plugin/MockPluginSource.java +58 -0
  202. data/embulk-core/src/test/java/org/embulk/plugin/TestPluginType.java +18 -0
  203. data/embulk-core/src/test/java/org/embulk/spi/MockFileOutput.java +63 -0
  204. data/embulk-core/src/test/java/org/embulk/spi/MockFormatterPlugin.java +101 -0
  205. data/embulk-core/src/test/java/org/embulk/spi/MockParserPlugin.java +73 -0
  206. data/embulk-core/src/test/java/org/embulk/spi/PageTestUtils.java +57 -0
  207. data/embulk-core/src/test/java/org/embulk/spi/TestBuffer.java +24 -0
  208. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +89 -0
  209. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputRunner.java +196 -0
  210. data/embulk-core/src/test/java/org/embulk/spi/TestFileOutputRunner.java +207 -0
  211. data/embulk-core/src/test/java/org/embulk/spi/TestInputStreamFileInput.java +188 -0
  212. data/embulk-core/src/test/java/org/embulk/spi/TestPageBuilderReader.java +319 -0
  213. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestamp.java +116 -0
  214. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestampFormatterParser.java +63 -0
  215. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestampFormatterParserDeprecated.java +67 -0
  216. data/embulk-core/src/test/java/org/embulk/spi/type/TestTypeSerDe.java +45 -0
  217. data/embulk-core/src/test/java/org/embulk/spi/unit/TestByteSize.java +79 -0
  218. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineDecoder.java +132 -0
  219. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineEncoder.java +123 -0
  220. data/embulk-docs/Makefile +178 -0
  221. data/embulk-docs/build.gradle +32 -0
  222. data/embulk-docs/make.bat +243 -0
  223. data/embulk-docs/push-gh-pages.sh +49 -0
  224. data/embulk-docs/src/_static/embulk-architecture.png +0 -0
  225. data/embulk-docs/src/_static/embulk-logo.png +0 -0
  226. data/embulk-docs/src/_static/embulk-logo.svg +133 -0
  227. data/embulk-docs/src/built-in.rst +440 -0
  228. data/embulk-docs/src/conf.py +260 -0
  229. data/embulk-docs/src/customization.rst +184 -0
  230. data/embulk-docs/src/index.rst +84 -0
  231. data/embulk-docs/src/recipe.rst +8 -0
  232. data/embulk-docs/src/recipe/scheduled-csv-load-to-elasticsearch-kibana4.rst +153 -0
  233. data/embulk-docs/src/release.rst +57 -0
  234. data/embulk-docs/src/release/release-0.1.0.rst +8 -0
  235. data/embulk-docs/src/release/release-0.2.0.rst +16 -0
  236. data/embulk-docs/src/release/release-0.2.1.rst +19 -0
  237. data/embulk-docs/src/release/release-0.3.0.rst +34 -0
  238. data/embulk-docs/src/release/release-0.3.1.rst +11 -0
  239. data/embulk-docs/src/release/release-0.3.2.rst +15 -0
  240. data/embulk-docs/src/release/release-0.4.0.rst +74 -0
  241. data/embulk-docs/src/release/release-0.4.1.rst +18 -0
  242. data/embulk-docs/src/release/release-0.4.10.rst +17 -0
  243. data/embulk-docs/src/release/release-0.4.2.rst +18 -0
  244. data/embulk-docs/src/release/release-0.4.3.rst +34 -0
  245. data/embulk-docs/src/release/release-0.4.4.rst +39 -0
  246. data/embulk-docs/src/release/release-0.4.5.rst +24 -0
  247. data/embulk-docs/src/release/release-0.4.6.rst +30 -0
  248. data/embulk-docs/src/release/release-0.4.7.rst +16 -0
  249. data/embulk-docs/src/release/release-0.4.8.rst +15 -0
  250. data/embulk-docs/src/release/release-0.4.9.rst +23 -0
  251. data/embulk-docs/src/release/release-0.5.0.rst +89 -0
  252. data/embulk-docs/src/release/release-0.5.1.rst +13 -0
  253. data/embulk-docs/src/release/release-0.5.2.rst +30 -0
  254. data/embulk-docs/src/release/release-0.5.3.rst +22 -0
  255. data/embulk-docs/src/release/release-0.5.4.rst +24 -0
  256. data/embulk-docs/src/release/release-0.5.5.rst +18 -0
  257. data/embulk-docs/src/release/release-0.6.0.rst +34 -0
  258. data/embulk-docs/src/release/release-0.6.1.rst +11 -0
  259. data/embulk-docs/src/release/release-0.6.10.rst +15 -0
  260. data/embulk-docs/src/release/release-0.6.11.rst +19 -0
  261. data/embulk-docs/src/release/release-0.6.12.rst +31 -0
  262. data/embulk-docs/src/release/release-0.6.13.rst +23 -0
  263. data/embulk-docs/src/release/release-0.6.14.rst +47 -0
  264. data/embulk-docs/src/release/release-0.6.15.rst +26 -0
  265. data/embulk-docs/src/release/release-0.6.16.rst +26 -0
  266. data/embulk-docs/src/release/release-0.6.17.rst +39 -0
  267. data/embulk-docs/src/release/release-0.6.18.rst +14 -0
  268. data/embulk-docs/src/release/release-0.6.19.rst +18 -0
  269. data/embulk-docs/src/release/release-0.6.2.rst +17 -0
  270. data/embulk-docs/src/release/release-0.6.20.rst +19 -0
  271. data/embulk-docs/src/release/release-0.6.21.rst +20 -0
  272. data/embulk-docs/src/release/release-0.6.22.rst +26 -0
  273. data/embulk-docs/src/release/release-0.6.23.rst +17 -0
  274. data/embulk-docs/src/release/release-0.6.24.rst +13 -0
  275. data/embulk-docs/src/release/release-0.6.25.rst +12 -0
  276. data/embulk-docs/src/release/release-0.6.3.rst +23 -0
  277. data/embulk-docs/src/release/release-0.6.4.rst +13 -0
  278. data/embulk-docs/src/release/release-0.6.5.rst +17 -0
  279. data/embulk-docs/src/release/release-0.6.6.rst +17 -0
  280. data/embulk-docs/src/release/release-0.6.7.rst +17 -0
  281. data/embulk-docs/src/release/release-0.6.8.rst +24 -0
  282. data/embulk-docs/src/release/release-0.6.9.rst +24 -0
  283. data/embulk-docs/src/release/release-0.7.0.rst +96 -0
  284. data/embulk-standards/build.gradle +5 -0
  285. data/embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java +284 -0
  286. data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +379 -0
  287. data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +411 -0
  288. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java +55 -0
  289. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java +71 -0
  290. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +203 -0
  291. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java +148 -0
  292. data/embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java +59 -0
  293. data/embulk-standards/src/main/java/org/embulk/standards/RenameFilterPlugin.java +56 -0
  294. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginExtension.java +16 -0
  295. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +53 -0
  296. data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +85 -0
  297. data/embulk-standards/src/main/resources/META-INF/services/org.embulk.spi.Extension +1 -0
  298. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvFormatterPlugin.java +312 -0
  299. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +75 -0
  300. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +360 -0
  301. data/embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java +88 -0
  302. data/embulk.gemspec +39 -0
  303. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  304. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  305. data/gradlew +164 -0
  306. data/gradlew.bat +90 -0
  307. data/lib/embulk.rb +72 -0
  308. data/lib/embulk/buffer.rb +22 -0
  309. data/lib/embulk/column.rb +70 -0
  310. data/lib/embulk/command/embulk_bundle.rb +56 -0
  311. data/lib/embulk/command/embulk_example.rb +32 -0
  312. data/lib/embulk/command/embulk_generate_bin.rb +62 -0
  313. data/lib/embulk/command/embulk_main.rb +2 -0
  314. data/lib/embulk/command/embulk_migrate_plugin.rb +170 -0
  315. data/lib/embulk/command/embulk_new_plugin.rb +124 -0
  316. data/lib/embulk/command/embulk_run.rb +470 -0
  317. data/lib/embulk/command/embulk_selfupdate.rb +84 -0
  318. data/lib/embulk/data/bundle/.bundle/config +3 -0
  319. data/lib/embulk/data/bundle/.ruby-version +1 -0
  320. data/lib/embulk/data/bundle/Gemfile +26 -0
  321. data/lib/embulk/data/bundle/embulk/filter/example.rb +42 -0
  322. data/lib/embulk/data/bundle/embulk/input/example.rb +54 -0
  323. data/lib/embulk/data/bundle/embulk/output/example.rb +58 -0
  324. data/lib/embulk/data/new/LICENSE.txt +21 -0
  325. data/lib/embulk/data/new/README.md.erb +111 -0
  326. data/lib/embulk/data/new/gitignore.erb +13 -0
  327. data/lib/embulk/data/new/java/build.gradle.erb +73 -0
  328. data/lib/embulk/data/new/java/decoder.java.erb +84 -0
  329. data/lib/embulk/data/new/java/encoder.java.erb +86 -0
  330. data/lib/embulk/data/new/java/file_input.java.erb +143 -0
  331. data/lib/embulk/data/new/java/file_output.java.erb +93 -0
  332. data/lib/embulk/data/new/java/filter.java.erb +56 -0
  333. data/lib/embulk/data/new/java/formatter.java.erb +54 -0
  334. data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.jar +0 -0
  335. data/lib/embulk/data/new/java/gradle/wrapper/gradle-wrapper.properties +6 -0
  336. data/lib/embulk/data/new/java/gradlew +164 -0
  337. data/lib/embulk/data/new/java/gradlew.bat +90 -0
  338. data/lib/embulk/data/new/java/input.java.erb +87 -0
  339. data/lib/embulk/data/new/java/output.java.erb +77 -0
  340. data/lib/embulk/data/new/java/parser.java.erb +60 -0
  341. data/lib/embulk/data/new/java/plugin_loader.rb.erb +3 -0
  342. data/lib/embulk/data/new/java/test.java.erb +5 -0
  343. data/lib/embulk/data/new/ruby/.ruby-version +1 -0
  344. data/lib/embulk/data/new/ruby/Gemfile +2 -0
  345. data/lib/embulk/data/new/ruby/Rakefile +3 -0
  346. data/lib/embulk/data/new/ruby/decoder_guess.rb.erb +25 -0
  347. data/lib/embulk/data/new/ruby/filter.rb.erb +41 -0
  348. data/lib/embulk/data/new/ruby/formatter.rb.erb +49 -0
  349. data/lib/embulk/data/new/ruby/gemspec.erb +20 -0
  350. data/lib/embulk/data/new/ruby/input.rb.erb +59 -0
  351. data/lib/embulk/data/new/ruby/output.rb.erb +61 -0
  352. data/lib/embulk/data/new/ruby/parser.rb.erb +44 -0
  353. data/lib/embulk/data/new/ruby/parser_guess.rb.erb +65 -0
  354. data/lib/embulk/data/package_data.rb +50 -0
  355. data/lib/embulk/data_source.rb +220 -0
  356. data/lib/embulk/decoder_plugin.rb +27 -0
  357. data/lib/embulk/encoder_plugin.rb +27 -0
  358. data/lib/embulk/error.rb +8 -0
  359. data/lib/embulk/executor_plugin.rb +23 -0
  360. data/lib/embulk/file_input.rb +87 -0
  361. data/lib/embulk/file_input_plugin.rb +27 -0
  362. data/lib/embulk/file_output.rb +56 -0
  363. data/lib/embulk/file_output_plugin.rb +27 -0
  364. data/lib/embulk/filter_plugin.rb +105 -0
  365. data/lib/embulk/formatter_plugin.rb +105 -0
  366. data/lib/embulk/guess/charset.rb +44 -0
  367. data/lib/embulk/guess/csv.rb +327 -0
  368. data/lib/embulk/guess/gzip.rb +18 -0
  369. data/lib/embulk/guess/newline.rb +22 -0
  370. data/lib/embulk/guess/schema_guess.rb +118 -0
  371. data/lib/embulk/guess/time_format_guess.rb +394 -0
  372. data/lib/embulk/guess_plugin.rb +129 -0
  373. data/lib/embulk/input_plugin.rb +121 -0
  374. data/lib/embulk/java/bootstrap.rb +24 -0
  375. data/lib/embulk/java/imports.rb +69 -0
  376. data/lib/embulk/java/time_helper.rb +79 -0
  377. data/lib/embulk/java_plugin.rb +90 -0
  378. data/lib/embulk/logger.rb +154 -0
  379. data/lib/embulk/output_plugin.rb +150 -0
  380. data/lib/embulk/page.rb +30 -0
  381. data/lib/embulk/page_builder.rb +76 -0
  382. data/lib/embulk/parser_plugin.rb +78 -0
  383. data/lib/embulk/plugin.rb +239 -0
  384. data/lib/embulk/plugin_registry.rb +96 -0
  385. data/lib/embulk/runner.rb +184 -0
  386. data/lib/embulk/schema.rb +103 -0
  387. data/lib/embulk/version.rb +3 -0
  388. data/settings.gradle +6 -0
  389. data/test/guess/test_schema_guess.rb +11 -0
  390. data/test/guess/test_time_format_guess.rb +133 -0
  391. data/test/helper.rb +21 -0
  392. data/test/run-test.rb +14 -0
  393. metadata +566 -0
@@ -0,0 +1,27 @@
1
+ module Embulk
2
+
3
+ class FileInputPlugin
4
+ # TODO transaction, resume, cleanup
5
+ # TODO run
6
+
7
+ if Embulk.java?
8
+ # TODO to_java
9
+
10
+ def self.from_java(java_class)
11
+ JavaPlugin.ruby_adapter_class(java_class, FileInputPlugin, RubyAdapter)
12
+ end
13
+
14
+ module RubyAdapter
15
+ module ClassMethods
16
+ def new_java
17
+ Java::FileInputRunner.new(Java.injector.getInstance(java_class))
18
+ end
19
+ # TODO transaction, resume, cleanup
20
+ end
21
+
22
+ # TODO run
23
+ end
24
+ end
25
+ end
26
+
27
+ end
@@ -0,0 +1,56 @@
1
+
2
+ module Embulk
3
+ require 'embulk/buffer'
4
+
5
+ class FileOutput
6
+ def initialize(java_file_output)
7
+ @java_file_output = java_file_output
8
+ @buffer = Buffer.new
9
+ @buffer.force_encoding('ASCII-8BIT')
10
+ @flush_size = 32*1024
11
+ end
12
+
13
+ def next_file
14
+ flush
15
+ @java_file_output.nextFile
16
+ self
17
+ end
18
+
19
+ def write(buffer)
20
+ buffer.force_encoding('ASCII-8BIT') # TODO this destructively changes the encoding of the buffer passed in by the caller
21
+ @buffer << buffer
22
+ if @buffer.size > @flush_size
23
+ flush
24
+ end
25
+ nil
26
+ end
27
+
28
+ def add(buffer)
29
+ flush
30
+ @java_file_output.add(Buffer.from_ruby_string(buffer))
31
+ nil
32
+ end
33
+
34
+ def flush
35
+ unless @buffer.empty?
36
+ @java_file_output.add(@buffer.to_java)
37
+ @buffer.clear
38
+ end
39
+ nil
40
+ end
41
+
42
+ def finish
43
+ flush
44
+ @java_file_output.finish
45
+ end
46
+
47
+ def close
48
+ @java_file_output.close
49
+ end
50
+
51
+ def to_java
52
+ @java_file_output
53
+ end
54
+ end
55
+
56
+ end
@@ -0,0 +1,27 @@
1
+ module Embulk
2
+
3
+ class FileOutputPlugin
4
+ # TODO transaction, resume, cleanup
5
+ # TODO add, finish, close, abort, commit
6
+
7
+ if Embulk.java?
8
+ # TODO to_java
9
+
10
+ def self.from_java(java_class)
11
+ JavaPlugin.ruby_adapter_class(java_class, FileOutputPlugin, RubyAdapter)
12
+ end
13
+
14
+ module RubyAdapter
15
+ module ClassMethods
16
+ def new_java
17
+ Java::FileOutputRunner.new(Java.injector.getInstance(java_class))
18
+ end
19
+ # TODO transaction, resume, cleanup
20
+ end
21
+
22
+ # TODO add, finish, close, abort, commit
23
+ end
24
+ end
25
+ end
26
+
27
+ end
@@ -0,0 +1,105 @@
1
+ module Embulk
2
+
3
+ require 'embulk/data_source'
4
+ require 'embulk/schema'
5
+ require 'embulk/page'
6
+ require 'embulk/page_builder'
7
+
8
+ class FilterPlugin
9
+ def self.transaction(config, in_schema, &control)
10
+ yield(config)
11
+ return {}
12
+ end
13
+
14
+ def initialize(task, in_schema, out_schema, page_builder)
15
+ @task = task
16
+ @in_schema = in_schema
17
+ @out_schema = out_schema
18
+ @page_builder = page_builder
19
+ init
20
+ end
21
+
22
+ attr_reader :task, :in_schema, :out_schema, :page_builder
23
+
24
+ def init
25
+ end
26
+
27
+ def add(page)
28
+ raise NotImplementedError, "FilterPlugin#add(page) must be implemented"
29
+ end
30
+
31
+ def finish
32
+ end
33
+
34
+ def close
35
+ end
36
+
37
+ if Embulk.java?
38
+ def self.new_java
39
+ JavaAdapter.new(self)
40
+ end
41
+
42
+ class JavaAdapter
43
+ include Java::FilterPlugin
44
+
45
+ def initialize(ruby_class)
46
+ @ruby_class = ruby_class
47
+ end
48
+
49
+ def transaction(java_config, java_in_schema, java_control)
50
+ config = DataSource.from_java(java_config)
51
+ in_schema = Schema.from_java(java_in_schema)
52
+ @ruby_class.transaction(config, in_schema) do |task_source_hash, out_columns|
53
+ java_task_source = DataSource.from_ruby_hash(task_source_hash).to_java
54
+ java_out_schemas = Schema.new(out_columns).to_java
55
+ java_control.run(java_task_source, java_out_schemas)
56
+ end
57
+ nil
58
+ end
59
+
60
+ def open(java_task_source, java_in_schema, java_out_schema, java_output)
61
+ task_source = DataSource.from_java(java_task_source)
62
+ in_schema = Schema.from_java(java_in_schema)
63
+ out_schema = Schema.from_java(java_out_schema)
64
+ page_builder = PageBuilder.new(out_schema, java_output)
65
+ ruby_object = @ruby_class.new(task_source, in_schema, out_schema, page_builder)
66
+ return OutputAdapter.new(ruby_object, in_schema, page_builder)
67
+ end
68
+
69
+ class OutputAdapter
70
+ include Java::TransactionalPageOutput
71
+
72
+ def initialize(ruby_object, in_schema, page_builder)
73
+ @ruby_object = ruby_object
74
+ @in_schema = in_schema
75
+ @page_builder = page_builder
76
+ end
77
+
78
+ def add(java_page)
79
+ @ruby_object.add Page.new(java_page, @in_schema)
80
+ end
81
+
82
+ def finish
83
+ @ruby_object.finish
84
+ end
85
+
86
+ def close
87
+ @ruby_object.close
88
+ ensure
89
+ @page_builder.close
90
+ end
91
+ end
92
+ end
93
+
94
+ def self.from_java(java_class)
95
+ JavaPlugin.ruby_adapter_class(java_class, FilterPlugin, RubyAdapter)
96
+ end
97
+
98
+ module RubyAdapter
99
+ module ClassMethods
100
+ end
101
+ # TODO
102
+ end
103
+ end
104
+ end
105
+ end
@@ -0,0 +1,105 @@
1
+ module Embulk
2
+
3
+ require 'embulk/data_source'
4
+ require 'embulk/schema'
5
+ require 'embulk/page'
6
+ require 'embulk/file_output'
7
+
8
# Base class for formatter plugins written in Ruby. Subclasses must
# implement #add(page); #init, #finish and #close are optional hooks.
class FormatterPlugin
  # Default transaction: yields the config unchanged as the task source and
  # returns an empty hash.
  def self.transaction(config, schema, &control)
    yield config
    {}
  end

  attr_reader :task, :schema, :file_output

  # Stores the task, schema and file output, then calls the #init hook.
  def initialize(task, schema, file_output)
    @task, @schema, @file_output = task, schema, file_output
    init
  end

  # Subclass initialization hook; no-op by default.
  def init; end

  # Receives one page; must be overridden.
  def add(page)
    raise NotImplementedError, "FormatterPlugin#add(page) must be implemented"
  end

  # Optional hook; no-op by default.
  def finish; end

  # Optional hook; no-op by default.
  def close; end

  # Java bridge; only defined when Embulk.java? is true.
  if Embulk.java?
    # Wraps this Ruby plugin class in an object implementing Java::FormatterPlugin.
    def self.new_java
      JavaAdapter.new(self)
    end

    # Implements Java::FormatterPlugin by delegating to a Ruby plugin class.
    class JavaAdapter
      include Java::FormatterPlugin

      def initialize(ruby_class)
        @ruby_class = ruby_class
      end

      # Runs the Ruby class' transaction and forwards the yielded task source
      # hash to java_control.run. Always returns nil.
      def transaction(java_config, java_schema, java_control)
        ruby_config = DataSource.from_java(java_config)
        ruby_schema = Schema.from_java(java_schema)
        @ruby_class.transaction(ruby_config, ruby_schema) do |task_hash|
          java_control.run(DataSource.from_ruby_hash(task_hash).to_java)
        end
        nil
      end

      # Instantiates the Ruby plugin for one task and returns an OutputAdapter
      # that feeds Java pages into it.
      def open(java_task_source, java_schema, java_file_output)
        task = DataSource.from_java(java_task_source)
        ruby_schema = Schema.from_java(java_schema)
        output = FileOutput.new(java_file_output)
        plugin = @ruby_class.new(task, ruby_schema, output)
        OutputAdapter.new(plugin, ruby_schema, output)
      end

      # Implements Java::TransactionalPageOutput on top of a Ruby plugin instance.
      class OutputAdapter
        include Java::TransactionalPageOutput

        def initialize(ruby_object, schema, file_output)
          @ruby_object, @schema, @file_output = ruby_object, schema, file_output
        end

        def add(java_page)
          # TODO reuse page reader
          page = Page.new(java_page, @schema)
          @ruby_object.add(page)
        end

        def finish
          @ruby_object.finish
        end

        # Closes the plugin; the file output is closed even if that raises.
        def close
          begin
            @ruby_object.close
          ensure
            @file_output.close
          end
        end
      end
    end

    # Returns the Ruby-side adapter class for a Java formatter plugin class.
    def self.from_java(java_class)
      JavaPlugin.ruby_adapter_class(java_class, FormatterPlugin, RubyAdapter)
    end

    module RubyAdapter
      module ClassMethods
        # TODO transaction, resume, cleanup
      end
      # TODO add, finish, close
    end
  end
end
104
+
105
+ end
@@ -0,0 +1,44 @@
1
+ module Embulk
2
+ module Guess
3
+
4
# Guess plugin that detects the character set of the sample data with
# ICU4J's CharsetDetector and reports it as the parser's "charset".
class CharsetGuessPlugin < GuessPlugin
  Plugin.register_guess('charset', self)

  # Detected charset name => charset name to report instead.
  STATIC_MAPPING = {
    # A detected ISO-8859-1 is, in most cases, plain ASCII (a subset of
    # UTF-8) seen through a too-small sample, so report UTF-8 instead.
    "ISO-8859-1" => "UTF-8",

    # Shift_JIS is used almost only by Windows, which in fact uses "CP932".
    # What Microsoft calls "CP932" is named "MS932" in Java.
    "Shift_JIS" => "MS932",
  }

  # Guesses the charset of sample_buffer.
  #
  # config        - guess configuration (not used by this plugin).
  # sample_buffer - sample data; must respond to #to_java_bytes.
  #
  # Returns {"parser" => {"charset" => name}}. The name is "UTF-8" when the
  # detector finds no match or its confidence is below 50; otherwise it is
  # the detected name, translated through STATIC_MAPPING when listed there.
  def guess(config, sample_buffer)
    # ICU4J
    begin
      detector_class = com.ibm.icu.text.CharsetDetector
    rescue NameError
      # icu4j is removed from embulk.gem package explicitly at embulk.gemspec
      # if gem is packaged for JRuby to reduce binary size. Instead, if it's
      # packaged for JRuby, embulk.gemspec adds rjack-icu to its dependency.
      require 'rjack-icu'
      detector_class = com.ibm.icu.text.CharsetDetector
    end
    detector = detector_class.new
    detector.setText(sample_buffer.to_java_bytes)
    best_match = detector.detect
    # CharsetDetector#detect returns null when no charset matches the input;
    # treat that the same as a low-confidence match.
    if best_match.nil? || best_match.getConfidence < 50
      name = "UTF-8"
    else
      detected = best_match.getName
      name = STATIC_MAPPING.fetch(detected, detected)
    end
    {"parser" => {"charset" => name}}
  end
end
42
+
43
+ end
44
+ end
@@ -0,0 +1,327 @@
1
+ module Embulk
2
+ module Guess
3
+ require 'embulk/guess/schema_guess'
4
+
5
+ class CsvGuessPlugin < LineGuessPlugin
6
+ Plugin.register_guess('csv', self)
7
+
8
+ DELIMITER_CANDIDATES = [
9
+ ",", "\t", "|"
10
+ ]
11
+
12
+ QUOTE_CANDIDATES = [
13
+ "\"", "'"
14
+ ]
15
+
16
+ ESCAPE_CANDIDATES = [
17
+ "\\", '"'
18
+ ]
19
+
20
+ NULL_STRING_CANDIDATES = [
21
+ "null",
22
+ "NULL",
23
+ "#N/A",
24
+ "\\N", # MySQL LOAD, Hive STORED AS TEXTFILE
25
+ ]
26
+
27
+ COMMENT_LINE_MARKER_CANDIDATES = [
28
+ "#",
29
+ "//",
30
+ ]
31
+
32
+ MAX_SKIP_LINES = 10
33
+ NO_SKIP_DETECT_LINES = 10
34
+
35
# Guesses CSV parser options and the column schema from sample lines.
#
# config       - current configuration; config["parser"] (if present)
#                supplies options that are kept as-is rather than guessed.
# sample_lines - Array of String lines sampled from the input.
#
# Returns {"parser" => guessed_parser_config}, or {} when the parser type is
# not "csv", no delimiter can be guessed, or fewer than two columns are found.
def guess_lines(config, sample_lines)
  return {} unless config.fetch("parser", {}).fetch("type", "csv") == "csv"

  parser_config = config["parser"] || {}
  if parser_config["type"] == "csv" && parser_config["delimiter"]
    delim = parser_config["delimiter"]
  else
    delim = guess_delimiter(sample_lines)
    # not CSV file
    return {} unless delim
  end

  parser_guessed = DataSource.new.merge(parser_config).merge({"type" => "csv", "delimiter" => delim})

  unless parser_guessed.has_key?("quote")
    quote = guess_quote(sample_lines, delim)
    if !quote && !guess_force_no_quote(sample_lines, delim, '"')
      # assuming CSV follows RFC for quoting
      quote = '"'
    end
    # quote stays nil here when quoting must be disabled (set null)
    parser_guessed["quote"] = quote
  end
  parser_guessed["quote"] = '"' if parser_guessed["quote"] == ''  # setting '' is not allowed any more. this line converts obsoleted config syntax to explicit syntax.

  unless parser_guessed.has_key?("escape")
    quote = parser_guessed["quote"]
    if quote
      escape = guess_escape(sample_lines, delim, quote)
      if !escape && quote == '"'
        # assuming this CSV follows RFC for escaping
        escape = '"'
      end
      # Store the result whether it was detected, defaulted, or left nil
      # (escaping disabled). Previously a detected escape was dropped because
      # the assignment only ran when guess_escape found nothing.
      parser_guessed["escape"] = escape
    end
    # escape does nothing if quote is disabled
  end

  unless parser_guessed.has_key?("null_string")
    null_string = guess_null_string(sample_lines, delim)
    parser_guessed["null_string"] = null_string if null_string
    # don't even set null_string to avoid confusion of null and 'null' in YAML format
  end

  # guessing skip_header_lines should be before guessing guess_comment_line_marker
  # because lines supplied to CsvTokenizer already don't include skipped header lines.
  # skipping empty lines is also disabled here because skipping header lines is done by
  # CsvParser which doesn't skip empty lines automatically
  sample_records = split_lines(parser_guessed, false, sample_lines, delim, {})
  skip_header_lines = guess_skip_header_lines(sample_records)
  sample_lines = sample_lines[skip_header_lines..-1]
  sample_records = sample_records[skip_header_lines..-1]

  unless parser_guessed.has_key?("comment_line_marker")
    comment_line_marker, sample_lines =
      guess_comment_line_marker(sample_lines, delim, parser_guessed["quote"], parser_guessed["null_string"])
    if comment_line_marker
      parser_guessed["comment_line_marker"] = comment_line_marker
    end
  end

  sample_records = split_lines(parser_guessed, true, sample_lines, delim, {})

  # Types of the first record alone vs. all following records; a mismatch
  # is what later identifies the first record as a header line.
  first_types = SchemaGuess.types_from_array_records(sample_records[0, 1])
  other_types = SchemaGuess.types_from_array_records(sample_records[1..-1] || [])

  if first_types.size <= 1 || other_types.size <= 1
    # guess failed
    return {}
  end

  unless parser_guessed.has_key?("trim_if_not_quoted")
    sample_records_trimmed = split_lines(parser_guessed, true, sample_lines, delim, {"trim_if_not_quoted" => true})
    other_types_trimmed = SchemaGuess.types_from_array_records(sample_records_trimmed[1..-1] || [])
    if other_types != other_types_trimmed
      # trimming changes the guessed types, so enable it
      parser_guessed["trim_if_not_quoted"] = true
      other_types = other_types_trimmed
    else
      parser_guessed["trim_if_not_quoted"] = false
    end
  end

  header_line = (first_types != other_types && first_types.all? {|t| ["string", "boolean"].include?(t) })

  if header_line
    parser_guessed["skip_header_lines"] = skip_header_lines + 1
  else
    parser_guessed["skip_header_lines"] = skip_header_lines
  end

  parser_guessed["allow_extra_columns"] = false unless parser_guessed.has_key?("allow_extra_columns")
  parser_guessed["allow_optional_columns"] = false unless parser_guessed.has_key?("allow_optional_columns")

  if header_line
    column_names = sample_records.first
  else
    # exclusive range: exactly one generated name per guessed type
    column_names = (0...other_types.size).map {|i| "c#{i}" }
  end
  schema = []
  column_names.zip(other_types).each do |name, type|
    next unless name && type
    if type.is_a?(SchemaGuess::TimestampTypeMatch)
      schema << {"name" => name, "type" => type, "format" => type.format}
    else
      schema << {"name" => name, "type" => type}
    end
  end
  parser_guessed["columns"] = schema

  return {"parser" => parser_guessed}
end
156
+
157
+ private
158
+
159
# Splits sample lines into records (arrays of column values) using the
# standard CsvTokenizer configured from parser_config merged with
# extra_config. Unquoted values equal to the configured null_string become
# nil. If anything raises, falls back to a plain String#split on delim.
def split_lines(parser_config, skip_empty_lines, sample_lines, delim, extra_config)
  null_string = parser_config["null_string"]
  config = parser_config.merge(extra_config).merge({"charset" => "UTF-8", "columns" => []})
  parser_task = config.load_config(org.embulk.standards.CsvParserPlugin::PluginTask)

  # Note: force_encoding re-tags the caller's strings in place.
  utf8_lines = sample_lines.map {|line| line.force_encoding('UTF-8') }
  newline = parser_task.getNewline.getString.encode('UTF-8')
  sample = Buffer.from_ruby_string(utf8_lines.join(newline))

  decoder = Java::LineDecoder.new(Java::ListFileInput.new([[sample.to_java]]), parser_task)
  tokenizer = org.embulk.standards.CsvTokenizer.new(decoder, parser_task)

  records = []
  while tokenizer.nextFile
    while tokenizer.nextRecord(skip_empty_lines)
      begin
        fields = []
        loop do
          begin
            value = tokenizer.nextColumn
            was_quoted = tokenizer.wasQuotedColumn
            value = nil if null_string && !was_quoted && value == null_string
            fields << value
          rescue org.embulk.standards.CsvTokenizer::TooFewColumnsException
            # the record is stored when the tokenizer reports no more columns
            records << fields
            break
          end
        end
      rescue org.embulk.standards.CsvTokenizer::InvalidValueException
        # TODO warning
        tokenizer.skipCurrentLine
      end
    end
  end
  records
rescue
  # TODO warning if fallback to this ad-hoc implementation
  sample_lines.map {|line| line.split(delim) }
end
196
+
197
# Picks the delimiter candidate that occurs often and with a consistent
# per-line count: score = total occurrences / standard deviation of the
# per-line counts. Returns the best candidate when its score exceeds 1,
# otherwise nil.
def guess_delimiter(sample_lines)
  scored = DELIMITER_CANDIDATES.map do |candidate|
    occurrences = sample_lines.map {|line| line.count(candidate) }
    sum = array_sum(occurrences)
    next [nil, 0] unless sum > 0

    deviation = array_standard_deviation(occurrences)
    # avoid dividing by zero when every line has the same count
    deviation = 0.000000001 if deviation == 0.0
    [candidate, sum / deviation]
  end

  best, best_score = *scored.sort_by {|_, score| score }.last
  (best != nil && best_score > 1) ? best : nil
end
218
+
219
# Guesses the quote character from QUOTE_CANDIDATES.
#
# For each candidate, every line containing it gets a weight: its number of
# occurrences, plus 20 per delimiter-bounded value wrapped in the candidate
# that contains no further candidate character, plus 40 per such value that
# contains no delimiter. The candidate's score is the average weight over
# the lines that contain it (0 if none do).
#
# Returns the highest-scoring candidate when its score is at least 10.0,
# otherwise nil.
def guess_quote(sample_lines, delim)
  delim_regexp = Regexp.escape(delim)
  quote_weights = QUOTE_CANDIDATES.map do |q|
    # Built once per candidate rather than once per line.
    q_regexp = Regexp.escape(q)
    quoted_without_quote = /(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{q_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/
    quoted_without_delim = /(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{delim_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/

    weights = sample_lines.map do |line|
      count = line.count(q)
      next nil unless count > 0
      count + line.scan(quoted_without_quote).size * 20 + line.scan(quoted_without_delim).size * 40
    end.compact
    weights.empty? ? 0 : array_avg(weights)
  end

  quote, weight = QUOTE_CANDIDATES.zip(quote_weights).sort_by {|q, w| w }.last
  weight >= 10.0 ? quote : nil
end
243
+
244
# Tells whether quoting should be disabled for quote_candidate.
#
# Returns true when, in any sample line, a value starting at the beginning
# of the line or right after a delimiter has at least one non-quote
# character before the quote character, i.e. the quote appears in the
# middle of a non-quoted value. Returns false otherwise.
def guess_force_no_quote(sample_lines, delim, quote_candidate)
  delim_regexp = Regexp.escape(delim)
  q_regexp = Regexp.escape(quote_candidate)
  # Built once instead of once per line.
  stray_quote = /(?:\A|#{delim_regexp})\s*[^#{q_regexp}]+#{q_regexp}/
  sample_lines.any? {|line| line =~ stray_quote }
end
252
+
253
# Guesses the escape character from ESCAPE_CANDIDATES by counting, over all
# sample lines, how often each candidate is immediately followed by the
# delimiter or the quote character. Returns the candidate with the most
# occurrences, or nil when none occurs.
def guess_escape(sample_lines, delim, quote)
  tallies = ESCAPE_CANDIDATES.map do |candidate|
    pattern = /#{Regexp.quote(candidate)}(?:#{Regexp.quote(delim)}|#{Regexp.quote(quote)})/
    total = sample_lines.inject(0) {|sum, line| sum + line.scan(pattern).count }
    [candidate, total]
  end

  ranked = tallies.select {|_, total| total > 0 }.sort_by {|_, total| -total }
  winner = ranked.first
  winner ? winner[0] : nil
end
263
+
264
# Guesses the null marker from NULL_STRING_CANDIDATES by counting how often
# each candidate makes up a whole value (bounded by line start/end or the
# delimiter) in the sample lines.
#
# Returns the most frequent candidate, or nil when none occurs.
def guess_null_string(sample_lines, delim)
  delim_regexp = Regexp.quote(delim)
  guessed = NULL_STRING_CANDIDATES.map do |str|
    # The trailing boundary is a lookahead so it is not consumed: in
    # "NULL,NULL" the shared delimiter must still be available as the
    # leading boundary of the second value, or it would not be counted.
    regexp = /(?:^|#{delim_regexp})#{Regexp.quote(str)}(?=$|#{delim_regexp})/
    count = sample_lines.inject(0) {|sum, line| sum + line.scan(regexp).count }
    [str, count]
  end.select {|str, count| count > 0 }.sort_by {|str, count| -count }

  found_str, _ = guessed.first
  found_str
end
274
+
275
# Guesses how many leading records to skip. Returns the index of the first
# record (within the first MAX_SKIP_LINES) whose column count is not
# exceeded by any of the next NO_SKIP_DETECT_LINES records; 0 when no such
# record is found.
def guess_skip_header_lines(sample_records)
  column_counts = sample_records.map {|record| record.size }
  last_candidate = [MAX_SKIP_LINES, column_counts.length - 1].min

  1.upto(last_candidate) do |next_row|
    width = column_counts[next_row - 1]
    following = column_counts[next_row, NO_SKIP_DETECT_LINES]
    return next_row - 1 if following.all? {|c| c <= width }
  end
  0
end
285
+
286
# Guesses the comment line marker from COMMENT_LINE_MARKER_CANDIDATES.
#
# A line counts as a comment for a candidate when it starts with the
# candidate and does not start with the quote character or with the null
# string followed by the delimiter or end of line.
#
# Returns [marker, lines_without_comments] for the candidate matching the
# most lines, or [nil, sample_lines] when no candidate matches any line.
def guess_comment_line_marker(sample_lines, delim, quote, null_string)
  protected_patterns = []
  protected_patterns << /^#{Regexp.escape(quote)}/ if quote && !quote.empty?
  protected_patterns << /^#{Regexp.escape(null_string)}(?:#{Regexp.escape(delim)}|$)/ if null_string

  candidates = COMMENT_LINE_MARKER_CANDIDATES.map do |marker|
    marker_pattern = /^#{Regexp.quote(marker)}/
    kept_lines = sample_lines.reject do |line|
      protected_patterns.none? {|pattern| line =~ pattern } && line =~ marker_pattern
    end
    [marker, sample_lines.size - kept_lines.size, kept_lines]
  end

  best = candidates.select {|_, hits, _| hits > 0 }.sort_by {|_, hits, _| -hits }.first
  if best
    return best[0], best[2]
  else
    return nil, sample_lines
  end
end
307
+
308
# Returns the sum of the elements of array (0 for an empty array).
def array_sum(array)
  array.inject(0, :+)
end
311
+
312
# Returns the arithmetic mean of array as a Float.
# An empty array yields NaN (0.0 / 0), so callers must guard against it.
def array_avg(array)
  array.inject(0.0, :+) / array.size
end
315
+
316
# Returns the population variance of array: the mean of the squared
# deviations from array_avg(array).
def array_variance(array)
  avg = array_avg(array)
  array.inject(0.0) {|sum, value| sum + (value - avg) ** 2 } / array.size
end
320
+
321
# Returns the standard deviation of array: the square root of
# array_variance(array).
def array_standard_deviation(array)
  variance = array_variance(array)
  Math.sqrt(variance)
end
324
+ end
325
+
326
+ end
327
+ end