embulk 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +13 -0
  3. data/Gemfile +3 -0
  4. data/Gemfile.lock +33 -0
  5. data/README.md +117 -0
  6. data/Rakefile +58 -0
  7. data/bin/embulk +63 -0
  8. data/build.gradle +149 -0
  9. data/embulk-cli/build.gradle +6 -0
  10. data/embulk-cli/pom.xml +94 -0
  11. data/embulk-cli/src/main/java/org/embulk/cli/Main.java +15 -0
  12. data/embulk-core/build.gradle +6 -0
  13. data/embulk-core/pom.xml +143 -0
  14. data/embulk-core/src/main/java/org/embulk/EmbulkService.java +39 -0
  15. data/embulk-core/src/main/java/org/embulk/command/Runner.java +199 -0
  16. data/embulk-core/src/main/java/org/embulk/command/TablePrinter.java +119 -0
  17. data/embulk-core/src/main/java/org/embulk/config/CommitReport.java +26 -0
  18. data/embulk-core/src/main/java/org/embulk/config/Config.java +15 -0
  19. data/embulk-core/src/main/java/org/embulk/config/ConfigDefault.java +15 -0
  20. data/embulk-core/src/main/java/org/embulk/config/ConfigException.java +20 -0
  21. data/embulk-core/src/main/java/org/embulk/config/ConfigLoader.java +83 -0
  22. data/embulk-core/src/main/java/org/embulk/config/ConfigSource.java +28 -0
  23. data/embulk-core/src/main/java/org/embulk/config/DataSource.java +35 -0
  24. data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +208 -0
  25. data/embulk-core/src/main/java/org/embulk/config/DataSourceSerDe.java +80 -0
  26. data/embulk-core/src/main/java/org/embulk/config/GenericTypeReference.java +20 -0
  27. data/embulk-core/src/main/java/org/embulk/config/ModelManager.java +125 -0
  28. data/embulk-core/src/main/java/org/embulk/config/NextConfig.java +26 -0
  29. data/embulk-core/src/main/java/org/embulk/config/Task.java +10 -0
  30. data/embulk-core/src/main/java/org/embulk/config/TaskInvocationHandler.java +180 -0
  31. data/embulk-core/src/main/java/org/embulk/config/TaskSerDe.java +343 -0
  32. data/embulk-core/src/main/java/org/embulk/config/TaskSource.java +28 -0
  33. data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +37 -0
  34. data/embulk-core/src/main/java/org/embulk/config/TaskValidator.java +24 -0
  35. data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +45 -0
  36. data/embulk-core/src/main/java/org/embulk/exec/ExecuteInterruptedException.java +10 -0
  37. data/embulk-core/src/main/java/org/embulk/exec/ExecuteResult.java +19 -0
  38. data/embulk-core/src/main/java/org/embulk/exec/ExtensionServiceLoaderModule.java +43 -0
  39. data/embulk-core/src/main/java/org/embulk/exec/ForSystemConfig.java +16 -0
  40. data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +307 -0
  41. data/embulk-core/src/main/java/org/embulk/exec/LocalExecutor.java +274 -0
  42. data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +30 -0
  43. data/embulk-core/src/main/java/org/embulk/exec/NoSampleException.java +10 -0
  44. data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +58 -0
  45. data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +138 -0
  46. data/embulk-core/src/main/java/org/embulk/exec/PreviewResult.java +27 -0
  47. data/embulk-core/src/main/java/org/embulk/exec/PreviewedNoticeError.java +17 -0
  48. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +116 -0
  49. data/embulk-core/src/main/java/org/embulk/exec/SystemConfigModule.java +24 -0
  50. data/embulk-core/src/main/java/org/embulk/jruby/JRubyPluginSource.java +69 -0
  51. data/embulk-core/src/main/java/org/embulk/jruby/JRubyScriptingModule.java +100 -0
  52. data/embulk-core/src/main/java/org/embulk/plugin/BuiltinPluginSourceModule.java +17 -0
  53. data/embulk-core/src/main/java/org/embulk/plugin/InjectedPluginSource.java +92 -0
  54. data/embulk-core/src/main/java/org/embulk/plugin/PluginManager.java +34 -0
  55. data/embulk-core/src/main/java/org/embulk/plugin/PluginSource.java +6 -0
  56. data/embulk-core/src/main/java/org/embulk/plugin/PluginSourceNotMatchException.java +19 -0
  57. data/embulk-core/src/main/java/org/embulk/plugin/PluginType.java +47 -0
  58. data/embulk-core/src/main/java/org/embulk/plugin/SetThreadContextClassLoader.java +19 -0
  59. data/embulk-core/src/main/java/org/embulk/spi/Buffer.java +113 -0
  60. data/embulk-core/src/main/java/org/embulk/spi/BufferAllocator.java +8 -0
  61. data/embulk-core/src/main/java/org/embulk/spi/Column.java +92 -0
  62. data/embulk-core/src/main/java/org/embulk/spi/ColumnConfig.java +79 -0
  63. data/embulk-core/src/main/java/org/embulk/spi/DecoderPlugin.java +16 -0
  64. data/embulk-core/src/main/java/org/embulk/spi/EncoderPlugin.java +16 -0
  65. data/embulk-core/src/main/java/org/embulk/spi/Exec.java +76 -0
  66. data/embulk-core/src/main/java/org/embulk/spi/ExecAction.java +6 -0
  67. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +105 -0
  68. data/embulk-core/src/main/java/org/embulk/spi/Extension.java +42 -0
  69. data/embulk-core/src/main/java/org/embulk/spi/FileInput.java +11 -0
  70. data/embulk-core/src/main/java/org/embulk/spi/FileInputPlugin.java +19 -0
  71. data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +113 -0
  72. data/embulk-core/src/main/java/org/embulk/spi/FileOutput.java +13 -0
  73. data/embulk-core/src/main/java/org/embulk/spi/FileOutputPlugin.java +20 -0
  74. data/embulk-core/src/main/java/org/embulk/spi/FileOutputRunner.java +167 -0
  75. data/embulk-core/src/main/java/org/embulk/spi/FormatterPlugin.java +18 -0
  76. data/embulk-core/src/main/java/org/embulk/spi/GuessPlugin.java +9 -0
  77. data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +20 -0
  78. data/embulk-core/src/main/java/org/embulk/spi/OutputPlugin.java +21 -0
  79. data/embulk-core/src/main/java/org/embulk/spi/Page.java +45 -0
  80. data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +327 -0
  81. data/embulk-core/src/main/java/org/embulk/spi/PageFormat.java +47 -0
  82. data/embulk-core/src/main/java/org/embulk/spi/PageOutput.java +11 -0
  83. data/embulk-core/src/main/java/org/embulk/spi/PageReader.java +227 -0
  84. data/embulk-core/src/main/java/org/embulk/spi/ParserPlugin.java +17 -0
  85. data/embulk-core/src/main/java/org/embulk/spi/Schema.java +101 -0
  86. data/embulk-core/src/main/java/org/embulk/spi/SchemaConfig.java +52 -0
  87. data/embulk-core/src/main/java/org/embulk/spi/SchemaVisitor.java +14 -0
  88. data/embulk-core/src/main/java/org/embulk/spi/Transactional.java +10 -0
  89. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileInput.java +17 -0
  90. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileOutput.java +19 -0
  91. data/embulk-core/src/main/java/org/embulk/spi/TransactionalPageOutput.java +17 -0
  92. data/embulk-core/src/main/java/org/embulk/spi/time/DateTimeZoneSerDe.java +57 -0
  93. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelper.java +8 -0
  94. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelperFactory.java +6 -0
  95. data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +159 -0
  96. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +98 -0
  97. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormatter.java +55 -0
  98. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParseException.java +6 -0
  99. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParser.java +60 -0
  100. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampSerDe.java +50 -0
  101. data/embulk-core/src/main/java/org/embulk/spi/type/AbstractType.java +55 -0
  102. data/embulk-core/src/main/java/org/embulk/spi/type/BooleanType.java +12 -0
  103. data/embulk-core/src/main/java/org/embulk/spi/type/DoubleType.java +12 -0
  104. data/embulk-core/src/main/java/org/embulk/spi/type/LongType.java +12 -0
  105. data/embulk-core/src/main/java/org/embulk/spi/type/StringType.java +12 -0
  106. data/embulk-core/src/main/java/org/embulk/spi/type/TimestampType.java +39 -0
  107. data/embulk-core/src/main/java/org/embulk/spi/type/Type.java +15 -0
  108. data/embulk-core/src/main/java/org/embulk/spi/type/TypeDeserializer.java +47 -0
  109. data/embulk-core/src/main/java/org/embulk/spi/type/Types.java +14 -0
  110. data/embulk-core/src/main/java/org/embulk/spi/util/CharsetSerDe.java +55 -0
  111. data/embulk-core/src/main/java/org/embulk/spi/util/Decoders.java +81 -0
  112. data/embulk-core/src/main/java/org/embulk/spi/util/Encoders.java +81 -0
  113. data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +110 -0
  114. data/embulk-core/src/main/java/org/embulk/spi/util/FileOutputOutputStream.java +94 -0
  115. data/embulk-core/src/main/java/org/embulk/spi/util/InputStreamFileInput.java +111 -0
  116. data/embulk-core/src/main/java/org/embulk/spi/util/Inputs.java +74 -0
  117. data/embulk-core/src/main/java/org/embulk/spi/util/LineDecoder.java +118 -0
  118. data/embulk-core/src/main/java/org/embulk/spi/util/LineEncoder.java +109 -0
  119. data/embulk-core/src/main/java/org/embulk/spi/util/ListFileInput.java +52 -0
  120. data/embulk-core/src/main/java/org/embulk/spi/util/Newline.java +38 -0
  121. data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +102 -0
  122. data/embulk-core/src/main/java/org/embulk/spi/util/Pages.java +139 -0
  123. data/embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java +110 -0
  124. data/embulk-core/src/test/java/org/embulk/GuiceBinder.java +72 -0
  125. data/embulk-core/src/test/java/org/embulk/RandomManager.java +53 -0
  126. data/embulk-core/src/test/java/org/embulk/TestPluginSourceModule.java +23 -0
  127. data/embulk-core/src/test/java/org/embulk/TestUtilityModule.java +17 -0
  128. data/embulk-core/src/test/java/org/embulk/config/TestConfigSource.java +114 -0
  129. data/embulk-core/src/test/java/org/embulk/config/TestTaskSource.java +70 -0
  130. data/embulk-core/src/test/java/org/embulk/plugin/MockPluginSource.java +57 -0
  131. data/embulk-core/src/test/java/org/embulk/plugin/TestPluginType.java +18 -0
  132. data/embulk-core/src/test/java/org/embulk/spi/MockFileOutput.java +63 -0
  133. data/embulk-core/src/test/java/org/embulk/spi/MockFormatterPlugin.java +101 -0
  134. data/embulk-core/src/test/java/org/embulk/spi/MockParserPlugin.java +73 -0
  135. data/embulk-core/src/test/java/org/embulk/spi/PageTestUtils.java +78 -0
  136. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +67 -0
  137. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputRunner.java +180 -0
  138. data/embulk-core/src/test/java/org/embulk/spi/TestFileOutputRunner.java +192 -0
  139. data/embulk-core/src/test/java/org/embulk/spi/TestInputStreamFileInput.java +188 -0
  140. data/embulk-core/src/test/java/org/embulk/spi/TestPageBuilderReader.java +301 -0
  141. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestamp.java +116 -0
  142. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestampFormatterParser.java +52 -0
  143. data/embulk-core/src/test/java/org/embulk/spi/type/TestTypeSerDe.java +45 -0
  144. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineDecoder.java +132 -0
  145. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineEncoder.java +123 -0
  146. data/embulk-standards/build.gradle +6 -0
  147. data/embulk-standards/pom.xml +68 -0
  148. data/embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java +158 -0
  149. data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +233 -0
  150. data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +355 -0
  151. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java +55 -0
  152. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java +39 -0
  153. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +138 -0
  154. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java +128 -0
  155. data/embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java +46 -0
  156. data/embulk-standards/src/main/java/org/embulk/standards/S3FileInputPlugin.java +238 -0
  157. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginExtension.java +16 -0
  158. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +44 -0
  159. data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +71 -0
  160. data/embulk-standards/src/main/resources/META-INF/services/org.embulk.spi.Extension +1 -0
  161. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +69 -0
  162. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +291 -0
  163. data/embulk-standards/src/test/java/org/embulk/standards/TestS3FileInputPlugin.java +43 -0
  164. data/embulk.gemspec +27 -0
  165. data/examples/config.yml +34 -0
  166. data/examples/csv/sample.csv.gz +0 -0
  167. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  168. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  169. data/gradlew +164 -0
  170. data/gradlew.bat +90 -0
  171. data/lib/embulk.rb +16 -0
  172. data/lib/embulk/buffer.rb +17 -0
  173. data/lib/embulk/column.rb +47 -0
  174. data/lib/embulk/command/embulk.rb +39 -0
  175. data/lib/embulk/command/embulk_example.rb +32 -0
  176. data/lib/embulk/command/embulk_generate_bin.rb +62 -0
  177. data/lib/embulk/command/embulk_run.rb +243 -0
  178. data/lib/embulk/data/bundle/.bundle/config +3 -0
  179. data/lib/embulk/data/bundle/Gemfile +31 -0
  180. data/lib/embulk/data/bundle/Gemfile.lock +8 -0
  181. data/lib/embulk/data/bundle/embulk/input_example.rb +40 -0
  182. data/lib/embulk/data/bundle/embulk/output_example.rb +51 -0
  183. data/lib/embulk/data_source.rb +66 -0
  184. data/lib/embulk/error.rb +5 -0
  185. data/lib/embulk/guess_charset.rb +26 -0
  186. data/lib/embulk/guess_csv.rb +195 -0
  187. data/lib/embulk/guess_gzip.rb +18 -0
  188. data/lib/embulk/guess_newline.rb +20 -0
  189. data/lib/embulk/guess_plugin.rb +113 -0
  190. data/lib/embulk/input_plugin.rb +53 -0
  191. data/lib/embulk/java/bootstrap.rb +12 -0
  192. data/lib/embulk/java/imports.rb +26 -0
  193. data/lib/embulk/java/time_helper.rb +77 -0
  194. data/lib/embulk/output_plugin.rb +104 -0
  195. data/lib/embulk/page.rb +28 -0
  196. data/lib/embulk/page_builder.rb +22 -0
  197. data/lib/embulk/plugin.rb +152 -0
  198. data/lib/embulk/plugin_registry.rb +70 -0
  199. data/lib/embulk/schema.rb +85 -0
  200. data/lib/embulk/time_format_guess.rb +331 -0
  201. data/lib/embulk/version.rb +3 -0
  202. data/pom.xml +533 -0
  203. data/settings.gradle +5 -0
  204. metadata +370 -0
@@ -0,0 +1,51 @@
module Embulk

  # Sample output plugin: prints every record it receives to stdout, prefixed
  # with a configurable message, and reports how many records it printed.
  class OutputExample < OutputPlugin
    # output plugin file name must be: embulk/output_<name>.rb
    Plugin.register_output('example', self)

    # Builds the task from +config+ (the 'message' string, "record" when it is
    # not set), passes it to the given block and prints the commit reports the
    # block returns as JSON. Always returns an empty hash.
    def self.transaction(config, schema, processor_count, &control)
      message = config.param('message', :string, default: "record")
      task = {'message' => message}

      puts "Example output started."
      reports = yield(task)
      puts "Example output finished. Commit reports = #{reports.to_json}"

      {}
    end

    # Announces the thread index, lets the superclass initialise itself with
    # the same arguments, then reads the message prefix from +task+ and resets
    # the record counter.
    def initialize(task, schema, index)
      puts "Example output thread #{index}..."
      super
      @message = task.param('message', :string)
      @records = 0
    end

    # Nothing to release.
    def close
    end

    # Prints each record of +page+ as a JSON object keyed by column name and
    # counts it.
    def add(page)
      page.each do |record|
        row = Hash[schema.names.zip(record)]
        puts "#{@message}: #{row.to_json}"
        @records += 1
      end
    end

    # No buffered output to flush.
    def finish
    end

    # Nothing to roll back.
    def abort
    end

    # Returns the commit report: the number of records printed so far.
    def commit
      {"records" => @records}
    end
  end

end
@@ -0,0 +1,66 @@
module Embulk
  require 'json'

  # Hash subclass that adds typed, validated access to its entries (#param)
  # and, when Embulk.java? is true, conversion to and from Java-side objects.
  class DataSource < Hash
    # Fetches the entry stored under +key+ and converts it according to +type+.
    #
    # key::     key to look up in this hash
    # type::    :integer, :float, :string, :bool, :hash, :array, or any object
    #           responding to +load+ (it is called with the raw value)
    # options:: :default => value returned as-is when +key+ is absent
    #
    # Returns the converted value, or the default when the key is absent.
    # Raises ArgumentError for an unknown type or for a value that does not
    # fit :bool, :hash or :array (Integer()/Float() raise their own errors),
    # and RuntimeError when the key is absent and no :default was given.
    def param(key, type, options={})
      unless has_key?(key)
        return options[:default] if options.has_key?(:default)
        raise "Required field #{key.to_s.dump} is not set"
      end

      v = self[key]
      case type
      when :integer
        Integer(v)
      when :float
        Float(v)
      when :string
        String(v)
      when :bool
        # Only real booleans and the strings "true"/"false" are accepted:
        # plain truthiness would silently turn the string "false" into true.
        case v
        when true, false
          v
        when "true"
          true
        when "false"
          false
        else
          raise ArgumentError, "Invalid value for :bool"
        end
      when :hash
        raise ArgumentError, "Invalid value for :hash" unless v.is_a?(Hash)
        v
      when :array
        raise ArgumentError, "Invalid value for :array" unless v.is_a?(Array)
        v
      else
        unless type.respond_to?(:load)
          raise ArgumentError, "Unknown type #{type.to_s.dump}"
        end
        type.load(v)
      end
    end

    if Embulk.java?
      # Builds a DataSource from the JSON text produced by the Java object's
      # toString.
      def self.from_java_object(java_data_source_impl)
        json = java_data_source_impl.toString
        new.merge!(JSON.parse(json))
      end

      # Builds a DataSource holding the entries of a plain Ruby hash.
      def self.from_ruby_hash(hash)
        new.merge!(hash)
      end

      # Serialises this hash to JSON and has the injected ModelManager read it
      # back as a Java DataSourceImpl.
      def java_object
        json = to_json
        Java::Injected::ModelManager.readObject(Java::DataSourceImpl.java_class, json.to_java)
      end

      # Reads this hash (as JSON) into an instance of +task_type+ through
      # ModelManager.readObjectWithConfigSerDe.
      def load_config(task_type)
        Java::Injected::ModelManager.readObjectWithConfigSerDe(task_type.java_class, to_json.to_java)
      end

      # Reads this hash (as JSON) into an instance of +task_type+ through
      # ModelManager.readObject.
      def load_task(task_type)
        Java::Injected::ModelManager.readObject(task_type.java_class, to_json.to_java)
      end
    end
  end

end
@@ -0,0 +1,5 @@

module Embulk
  # Error class for configuration problems; it adds nothing to StandardError
  # beyond its own name.
  class ConfigError < StandardError
  end
end
@@ -0,0 +1,26 @@
module Embulk

  # Guess plugin, registered as 'charset', that proposes the parser charset
  # for a sample buffer using ICU4J's CharsetDetector.
  class GuessCharset < GuessPlugin
    Plugin.register_guess('charset', self)

    # Returns {"parser" => {"charset" => name}}. UTF-8 is used when the
    # detector finds no match, when its confidence is below 50, or when it
    # reports ISO-8859-1.
    def guess(config, sample_buffer)
      # ICU4J
      detector = com.ibm.icu.text.CharsetDetector.new
      detector.setText(sample_buffer.to_java_bytes)
      best_match = detector.detect

      # detect returns null when nothing matches at all; treat that the same
      # as a low-confidence match instead of calling a method on nil.
      if best_match.nil? || best_match.getConfidence < 50
        name = "UTF-8"
      else
        name = best_match.getName
        if name == "ISO-8859-1"
          # ISO-8859-1 means ASCII which is a subset
          # of UTF-8 in most of cases due to lack of
          # sample data set
          name = "UTF-8"
        end
      end

      {"parser" => {"charset" => name}}
    end
  end

end
@@ -0,0 +1,195 @@
module Embulk
  require_relative 'time_format_guess'

  # Guess plugin, registered as 'csv', that inspects sample lines and proposes
  # CSV parser settings: delimiter, quote character, whether the first line is
  # a header, and a column schema.
  class GuessCsv < LineGuessPlugin
    Plugin.register_guess('csv', self)

    DELIMITER_CANDIDATES = [
      ",", "\t", "|"
    ]

    QUOTE_CANDIDATES = [
      "\"", "'"
    ]

    # Returns {"parser" => {...}} with the guessed settings, or {} when no
    # delimiter could be chosen or fewer than two columns were found.
    # "header_line" and "columns" are only guessed when config["parser"] does
    # not already provide them.
    def guess_lines(config, sample_lines)
      delim = guess_delimiter(sample_lines)
      # not CSV file
      return {} unless delim

      parser_config = config["parser"] || {}
      parser_guessed = {"type" => "csv", "delimiter" => delim}

      parser_guessed["quote"] = guess_quote(sample_lines, delim) || ''

      sample_records = sample_lines.map {|line| line.split(delim) } # TODO use CsvTokenizer
      first_types = guess_field_types(sample_records[0, 1])
      other_types = guess_field_types(sample_records[1..-1])

      # guess failed
      return {} if first_types.size <= 1 || other_types.size <= 1

      unless parser_config.has_key?("header_line")
        # The first line is taken as a header when it is all strings and its
        # types differ from the types of the remaining lines.
        parser_guessed["header_line"] = (first_types != other_types && first_types.all? {|t| t == ["string"] })
      end

      unless parser_config.has_key?("columns")
        if parser_guessed["header_line"] || parser_config["header_line"]
          column_names = sample_records.first
        else
          # Exclusive range: exactly one generated name (c0, c1, ...) per column.
          column_names = (0...other_types.size).map {|i| "c#{i}" }
        end

        schema = []
        column_names.zip(other_types).each do |name, (type, format)|
          next unless name && type
          column = {"name" => name, "type" => type}
          column["format"] = format if format
          schema << column
        end
        parser_guessed["columns"] = schema
      end

      {"parser" => parser_guessed}
    end

    private

    # Picks the candidate delimiter with the highest weight, where weight is
    # the total number of occurrences divided by the standard deviation of the
    # per-line counts (many and evenly spread occurrences score high).
    # Returns nil when no candidate occurs or the best weight is not above 1.
    def guess_delimiter(sample_lines)
      delim_weights = DELIMITER_CANDIDATES.map do |d|
        counts = sample_lines.map {|line| line.count(d) }
        total = array_sum(counts)
        if total > 0
          stddev = array_standard_deviation(counts)
          # avoid dividing by zero when every line has the same count
          stddev = 0.000000001 if stddev == 0.0
          [d, total / stddev]
        else
          [nil, 0]
        end
      end

      delim, weight = *delim_weights.sort_by {|_, w| w }.last
      if delim != nil && weight > 1
        delim
      else
        nil
      end
    end

    # Picks the candidate quote character with the highest average per-line
    # weight (lines without the character are ignored). A line's weight is the
    # number of occurrences, plus 20 for each delimited field wrapped in the
    # quote that contains no quote, plus 40 for each such field that contains
    # no delimiter. Returns nil when the best average is below 10.
    def guess_quote(sample_lines, delim)
      delim_regexp = Regexp.escape(delim)
      quote_weights = QUOTE_CANDIDATES.map do |q|
        q_regexp = Regexp.escape(q)
        # Built once per candidate instead of once per line.
        no_quote_inside = /(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{q_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/
        no_delim_inside = /(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{delim_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/

        weights = sample_lines.map do |line|
          count = line.count(q)
          if count > 0
            count + line.scan(no_quote_inside).size * 20 + line.scan(no_delim_inside).size * 40
          else
            nil
          end
        end.compact
        weights.empty? ? 0 : array_avg(weights)
      end

      quote, weight = QUOTE_CANDIDATES.zip(quote_weights).sort_by {|_, w| w }.last
      if weight >= 10.0
        quote
      else
        nil
      end
    end

    # Takes an array of records (arrays of field strings) and returns one
    # entry per column: ["timestamp", format] when every value in the column
    # looked like a timestamp, otherwise [type_name].
    def guess_field_types(field_lines)
      column_lines = []
      field_lines.each do |fields|
        fields.each_with_index {|field, i| (column_lines[i] ||= []) << guess_type(field) }
      end

      column_lines.map do |types|
        merged = types.inject(nil) {|acc, type| merge_type(acc, type) } || "string"
        if merged.is_a?(TimestampMatch)
          format = TimeFormatGuess.guess(types.map {|type| type.text })
          ["timestamp", format]
        else
          [merged]
        end
      end
    end

    # Maps a sorted pair of differing type names to the type that can hold
    # both: long with double gives double, boolean with long gives long.
    TYPE_COALESCE = Hash[{
      long: :double,
      boolean: :long,
    }.map {|k,v|
      [[k.to_s, v.to_s].sort, v.to_s]
    }]

    # Matches a plain decimal number: optional minus sign, integer part
    # without leading zeros, a dot, and one or more fraction digits.
    DOUBLE_PATTERN = /\A-?(?:0|[1-9]\d*)\.\d+\z/

    # Combines two guessed types: equal types stay as they are, nil yields the
    # other type, a pair listed in TYPE_COALESCE widens, anything else becomes
    # "string".
    def merge_type(type1, type2)
      if type1 == type2
        type1
      elsif type1.nil? || type2.nil?
        type1 || type2
      else
        TYPE_COALESCE[[type1, type2].sort] || "string"
      end
    end

    # The string "timestamp" that also remembers the sample text it was
    # guessed from, so a format can be guessed from all samples later.
    class TimestampMatch < String
      def initialize(text)
        super("timestamp")
        @text = text
      end
      attr_reader :text
    end

    # Guesses the type of a single field value. Returns "boolean", a
    # TimestampMatch, "long", "double" or "string", checked in that order.
    def guess_type(str)
      return "boolean" if ["true", "false"].include?(str)

      return TimestampMatch.new(str) if TimeFormatGuess.guess(str)

      # round-trip check rejects leading zeros, signs like "+5" and any junk
      return "long" if str.to_i.to_s == str

      # A pattern is used instead of round-tripping both halves through to_i:
      # that rejected fractions with leading zeros ("1.05") and "-0.5", and
      # accepted a negative fraction part ("1.-5").
      return "double" if str =~ DOUBLE_PATTERN

      "string"
    end

    # Sum of the elements (0 for an empty array).
    def array_sum(array)
      array.inject(0) {|r,i| r + i }
    end

    # Arithmetic mean as a Float (NaN for an empty array).
    def array_avg(array)
      array.inject(0.0) {|r,i| r + i } / array.size
    end

    # Population variance: the mean of the squared distances from the mean.
    def array_variance(array)
      avg = array_avg(array)
      array.inject(0.0) {|r,i| r + (i - avg) ** 2 } / array.size
    end

    # Square root of array_variance.
    def array_standard_deviation(array)
      Math.sqrt(array_variance(array))
    end
  end

end
@@ -0,0 +1,18 @@
module Embulk
  module Guess

    # Guess plugin, registered as 'gzip', that proposes a gzip decoder when
    # the sample starts with the two bytes in GZIP_HEADER.
    class GzipGuess < GuessPlugin
      Plugin.register_guess('gzip', self)

      GZIP_HEADER = "\x1f\x8b".force_encoding('ASCII-8BIT').freeze

      # Returns {"decoders" => [{"type" => "gzip"}]} when the first two
      # characters of +sample_buffer+ equal GZIP_HEADER, otherwise {}.
      def guess(config, sample_buffer)
        return {} unless sample_buffer[0,2] == GZIP_HEADER
        {"decoders" => [{"type" => "gzip"}]}
      end
    end

  end
end
@@ -0,0 +1,20 @@
module Embulk

  # Guess plugin, registered as 'newline', that proposes the parser newline
  # style by counting line-break characters in the sample text.
  class GuessNewline < TextGuessPlugin
    Plugin.register_guess('newline', self)

    # Returns {"parser" => {"newline" => style}} where style is "CRLF" when
    # "\r\n" pairs outnumber half of the "\r" count and half of the "\n"
    # count (integer division), "CR" when "\r" outnumbers half of the "\n"
    # count, and "LF" otherwise.
    def guess_text(config, sample_text)
      cr_count = sample_text.count("\r")
      lf_count = sample_text.count("\n")
      crlf_count = sample_text.scan(/\r\n/).length

      newline =
        if crlf_count > cr_count / 2 && crlf_count > lf_count / 2
          "CRLF"
        elsif cr_count > lf_count / 2
          "CR"
        else
          "LF"
        end

      {"parser" => {"newline" => newline}}
    end
  end

end
@@ -0,0 +1,113 @@
module Embulk

  # Base class of guess plugins. Subclasses implement #guess, which receives
  # the current config and a sample buffer.
  class GuessPlugin
    # Must be overridden; this base implementation always raises
    # NotImplementedError.
    def guess(config, sample_buffer)
      raise NotImplementedError, "GuessPlugin#guess(config, sample_buffer) must be implemented"
    end

    if Embulk.java?
      # Wraps a new instance of this Ruby plugin class so it can be used
      # through the Java::GuessPlugin interface.
      def self.java_object
        JavaAdapter.new(new)
      end

      # Wraps a Java guess plugin so it can be called like a Ruby GuessPlugin.
      def self.from_java_object(java_guess)
        RubyAdapter.new(java_guess)
      end

      # Ruby-facing wrapper around a Java guess plugin.
      class RubyAdapter < Embulk::GuessPlugin
        # Was spelled `initialized`, so it never ran as the constructor:
        # RubyAdapter.new(java_guess) raised ArgumentError and @java_guess
        # was never set.
        def initialize(java_guess)
          @java_guess = java_guess
        end

        # Converts both arguments with their java_object methods, delegates to
        # the wrapped Java plugin and returns its result as a DataSource.
        def guess(config, sample)
          java_config = config.java_object
          java_sample = sample.java_object
          java_next_config = @java_guess.guess(java_config, java_sample)
          DataSource.from_java_object(java_next_config)
        end

        # Returns the wrapped Java plugin.
        def java_object
          @java_guess
        end
      end

      # Java-facing wrapper around a Ruby guess plugin.
      class JavaAdapter
        include Java::GuessPlugin

        def initialize(ruby_guess)
          @ruby_guess = ruby_guess
        end

        # Converts the Java arguments to a DataSource and a Buffer, delegates
        # to the wrapped Ruby plugin and returns the hash it produced as a
        # Java object.
        def guess(java_config, java_sample)
          config = DataSource.from_java_object(java_config)
          sample = Buffer.from_java_object(java_sample)
          next_config_hash = @ruby_guess.guess(config, sample)
          DataSource.from_ruby_hash(next_config_hash).java_object
        end
      end
    end
  end

  # Guess plugin base for plugins that work on the decoded sample as a single
  # string. Subclasses implement #guess_text.
  class TextGuessPlugin < GuessPlugin
    # Decodes +sample+ with Java::LineDecoder, joins the decoded lines of each
    # file with the task's newline string and passes the text to #guess_text.
    # Returns an empty DataSource when the decoder task cannot be loaded from
    # +config+.
    def guess(config, sample)
      # TODO pure-ruby LineDecoder implementation?
      begin
        task = config.load_config(Java::LineDecoder::DecoderTask)
      rescue
        # TODO log?
        p $!
        p $!.backtrace
        return DataSource.new
      end

      decoder = Java::LineDecoder.new(Java::ListFileInput.new([[sample.java_object]]), task)
      sample_text = ''
      while decoder.nextFile
        first = true
        while line = decoder.poll
          if first
            first = false
          else
            # separator goes between lines, not before the first line of a file
            sample_text << task.getNewline().getString()
          end
          sample_text << line
        end
      end

      guess_text(config, sample_text)
    end

    # Must be overridden; this base implementation always raises
    # NotImplementedError.
    def guess_text(config, sample_text)
      raise NotImplementedError, "TextGuessPlugin#guess_text(config, sample_text) must be implemented"
    end
  end

  # Guess plugin base for plugins that work on the decoded sample as an array
  # of lines. Subclasses implement #guess_lines.
  class LineGuessPlugin < GuessPlugin
    # Decodes +sample+ with Java::LineDecoder, collects every decoded line and
    # passes the array to #guess_lines. Returns an empty DataSource when the
    # decoder task cannot be loaded from +config+.
    def guess(config, sample)
      # TODO pure-ruby LineDecoder implementation?
      begin
        task = config.load_config(Java::LineDecoder::DecoderTask)
      rescue
        # TODO log?
        p $!
        p $!.backtrace
        return DataSource.new
      end

      decoder = Java::LineDecoder.new(Java::ListFileInput.new([[sample.java_object]]), task)
      sample_lines = []
      while decoder.nextFile
        while line = decoder.poll
          sample_lines << line
        end
      end

      guess_lines(config, sample_lines)
    end

    # Must be overridden; this base implementation always raises
    # NotImplementedError.
    def guess_lines(config, sample_lines)
      raise NotImplementedError, "LineGuessPlugin#guess_lines(config, sample_lines) must be implemented"
    end
  end

end