embulk 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (204) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +13 -0
  3. data/Gemfile +3 -0
  4. data/Gemfile.lock +33 -0
  5. data/README.md +117 -0
  6. data/Rakefile +58 -0
  7. data/bin/embulk +63 -0
  8. data/build.gradle +149 -0
  9. data/embulk-cli/build.gradle +6 -0
  10. data/embulk-cli/pom.xml +94 -0
  11. data/embulk-cli/src/main/java/org/embulk/cli/Main.java +15 -0
  12. data/embulk-core/build.gradle +6 -0
  13. data/embulk-core/pom.xml +143 -0
  14. data/embulk-core/src/main/java/org/embulk/EmbulkService.java +39 -0
  15. data/embulk-core/src/main/java/org/embulk/command/Runner.java +199 -0
  16. data/embulk-core/src/main/java/org/embulk/command/TablePrinter.java +119 -0
  17. data/embulk-core/src/main/java/org/embulk/config/CommitReport.java +26 -0
  18. data/embulk-core/src/main/java/org/embulk/config/Config.java +15 -0
  19. data/embulk-core/src/main/java/org/embulk/config/ConfigDefault.java +15 -0
  20. data/embulk-core/src/main/java/org/embulk/config/ConfigException.java +20 -0
  21. data/embulk-core/src/main/java/org/embulk/config/ConfigLoader.java +83 -0
  22. data/embulk-core/src/main/java/org/embulk/config/ConfigSource.java +28 -0
  23. data/embulk-core/src/main/java/org/embulk/config/DataSource.java +35 -0
  24. data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +208 -0
  25. data/embulk-core/src/main/java/org/embulk/config/DataSourceSerDe.java +80 -0
  26. data/embulk-core/src/main/java/org/embulk/config/GenericTypeReference.java +20 -0
  27. data/embulk-core/src/main/java/org/embulk/config/ModelManager.java +125 -0
  28. data/embulk-core/src/main/java/org/embulk/config/NextConfig.java +26 -0
  29. data/embulk-core/src/main/java/org/embulk/config/Task.java +10 -0
  30. data/embulk-core/src/main/java/org/embulk/config/TaskInvocationHandler.java +180 -0
  31. data/embulk-core/src/main/java/org/embulk/config/TaskSerDe.java +343 -0
  32. data/embulk-core/src/main/java/org/embulk/config/TaskSource.java +28 -0
  33. data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +37 -0
  34. data/embulk-core/src/main/java/org/embulk/config/TaskValidator.java +24 -0
  35. data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +45 -0
  36. data/embulk-core/src/main/java/org/embulk/exec/ExecuteInterruptedException.java +10 -0
  37. data/embulk-core/src/main/java/org/embulk/exec/ExecuteResult.java +19 -0
  38. data/embulk-core/src/main/java/org/embulk/exec/ExtensionServiceLoaderModule.java +43 -0
  39. data/embulk-core/src/main/java/org/embulk/exec/ForSystemConfig.java +16 -0
  40. data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +307 -0
  41. data/embulk-core/src/main/java/org/embulk/exec/LocalExecutor.java +274 -0
  42. data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +30 -0
  43. data/embulk-core/src/main/java/org/embulk/exec/NoSampleException.java +10 -0
  44. data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +58 -0
  45. data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +138 -0
  46. data/embulk-core/src/main/java/org/embulk/exec/PreviewResult.java +27 -0
  47. data/embulk-core/src/main/java/org/embulk/exec/PreviewedNoticeError.java +17 -0
  48. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +116 -0
  49. data/embulk-core/src/main/java/org/embulk/exec/SystemConfigModule.java +24 -0
  50. data/embulk-core/src/main/java/org/embulk/jruby/JRubyPluginSource.java +69 -0
  51. data/embulk-core/src/main/java/org/embulk/jruby/JRubyScriptingModule.java +100 -0
  52. data/embulk-core/src/main/java/org/embulk/plugin/BuiltinPluginSourceModule.java +17 -0
  53. data/embulk-core/src/main/java/org/embulk/plugin/InjectedPluginSource.java +92 -0
  54. data/embulk-core/src/main/java/org/embulk/plugin/PluginManager.java +34 -0
  55. data/embulk-core/src/main/java/org/embulk/plugin/PluginSource.java +6 -0
  56. data/embulk-core/src/main/java/org/embulk/plugin/PluginSourceNotMatchException.java +19 -0
  57. data/embulk-core/src/main/java/org/embulk/plugin/PluginType.java +47 -0
  58. data/embulk-core/src/main/java/org/embulk/plugin/SetThreadContextClassLoader.java +19 -0
  59. data/embulk-core/src/main/java/org/embulk/spi/Buffer.java +113 -0
  60. data/embulk-core/src/main/java/org/embulk/spi/BufferAllocator.java +8 -0
  61. data/embulk-core/src/main/java/org/embulk/spi/Column.java +92 -0
  62. data/embulk-core/src/main/java/org/embulk/spi/ColumnConfig.java +79 -0
  63. data/embulk-core/src/main/java/org/embulk/spi/DecoderPlugin.java +16 -0
  64. data/embulk-core/src/main/java/org/embulk/spi/EncoderPlugin.java +16 -0
  65. data/embulk-core/src/main/java/org/embulk/spi/Exec.java +76 -0
  66. data/embulk-core/src/main/java/org/embulk/spi/ExecAction.java +6 -0
  67. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +105 -0
  68. data/embulk-core/src/main/java/org/embulk/spi/Extension.java +42 -0
  69. data/embulk-core/src/main/java/org/embulk/spi/FileInput.java +11 -0
  70. data/embulk-core/src/main/java/org/embulk/spi/FileInputPlugin.java +19 -0
  71. data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +113 -0
  72. data/embulk-core/src/main/java/org/embulk/spi/FileOutput.java +13 -0
  73. data/embulk-core/src/main/java/org/embulk/spi/FileOutputPlugin.java +20 -0
  74. data/embulk-core/src/main/java/org/embulk/spi/FileOutputRunner.java +167 -0
  75. data/embulk-core/src/main/java/org/embulk/spi/FormatterPlugin.java +18 -0
  76. data/embulk-core/src/main/java/org/embulk/spi/GuessPlugin.java +9 -0
  77. data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +20 -0
  78. data/embulk-core/src/main/java/org/embulk/spi/OutputPlugin.java +21 -0
  79. data/embulk-core/src/main/java/org/embulk/spi/Page.java +45 -0
  80. data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +327 -0
  81. data/embulk-core/src/main/java/org/embulk/spi/PageFormat.java +47 -0
  82. data/embulk-core/src/main/java/org/embulk/spi/PageOutput.java +11 -0
  83. data/embulk-core/src/main/java/org/embulk/spi/PageReader.java +227 -0
  84. data/embulk-core/src/main/java/org/embulk/spi/ParserPlugin.java +17 -0
  85. data/embulk-core/src/main/java/org/embulk/spi/Schema.java +101 -0
  86. data/embulk-core/src/main/java/org/embulk/spi/SchemaConfig.java +52 -0
  87. data/embulk-core/src/main/java/org/embulk/spi/SchemaVisitor.java +14 -0
  88. data/embulk-core/src/main/java/org/embulk/spi/Transactional.java +10 -0
  89. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileInput.java +17 -0
  90. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileOutput.java +19 -0
  91. data/embulk-core/src/main/java/org/embulk/spi/TransactionalPageOutput.java +17 -0
  92. data/embulk-core/src/main/java/org/embulk/spi/time/DateTimeZoneSerDe.java +57 -0
  93. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelper.java +8 -0
  94. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelperFactory.java +6 -0
  95. data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +159 -0
  96. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +98 -0
  97. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormatter.java +55 -0
  98. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParseException.java +6 -0
  99. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParser.java +60 -0
  100. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampSerDe.java +50 -0
  101. data/embulk-core/src/main/java/org/embulk/spi/type/AbstractType.java +55 -0
  102. data/embulk-core/src/main/java/org/embulk/spi/type/BooleanType.java +12 -0
  103. data/embulk-core/src/main/java/org/embulk/spi/type/DoubleType.java +12 -0
  104. data/embulk-core/src/main/java/org/embulk/spi/type/LongType.java +12 -0
  105. data/embulk-core/src/main/java/org/embulk/spi/type/StringType.java +12 -0
  106. data/embulk-core/src/main/java/org/embulk/spi/type/TimestampType.java +39 -0
  107. data/embulk-core/src/main/java/org/embulk/spi/type/Type.java +15 -0
  108. data/embulk-core/src/main/java/org/embulk/spi/type/TypeDeserializer.java +47 -0
  109. data/embulk-core/src/main/java/org/embulk/spi/type/Types.java +14 -0
  110. data/embulk-core/src/main/java/org/embulk/spi/util/CharsetSerDe.java +55 -0
  111. data/embulk-core/src/main/java/org/embulk/spi/util/Decoders.java +81 -0
  112. data/embulk-core/src/main/java/org/embulk/spi/util/Encoders.java +81 -0
  113. data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +110 -0
  114. data/embulk-core/src/main/java/org/embulk/spi/util/FileOutputOutputStream.java +94 -0
  115. data/embulk-core/src/main/java/org/embulk/spi/util/InputStreamFileInput.java +111 -0
  116. data/embulk-core/src/main/java/org/embulk/spi/util/Inputs.java +74 -0
  117. data/embulk-core/src/main/java/org/embulk/spi/util/LineDecoder.java +118 -0
  118. data/embulk-core/src/main/java/org/embulk/spi/util/LineEncoder.java +109 -0
  119. data/embulk-core/src/main/java/org/embulk/spi/util/ListFileInput.java +52 -0
  120. data/embulk-core/src/main/java/org/embulk/spi/util/Newline.java +38 -0
  121. data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +102 -0
  122. data/embulk-core/src/main/java/org/embulk/spi/util/Pages.java +139 -0
  123. data/embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java +110 -0
  124. data/embulk-core/src/test/java/org/embulk/GuiceBinder.java +72 -0
  125. data/embulk-core/src/test/java/org/embulk/RandomManager.java +53 -0
  126. data/embulk-core/src/test/java/org/embulk/TestPluginSourceModule.java +23 -0
  127. data/embulk-core/src/test/java/org/embulk/TestUtilityModule.java +17 -0
  128. data/embulk-core/src/test/java/org/embulk/config/TestConfigSource.java +114 -0
  129. data/embulk-core/src/test/java/org/embulk/config/TestTaskSource.java +70 -0
  130. data/embulk-core/src/test/java/org/embulk/plugin/MockPluginSource.java +57 -0
  131. data/embulk-core/src/test/java/org/embulk/plugin/TestPluginType.java +18 -0
  132. data/embulk-core/src/test/java/org/embulk/spi/MockFileOutput.java +63 -0
  133. data/embulk-core/src/test/java/org/embulk/spi/MockFormatterPlugin.java +101 -0
  134. data/embulk-core/src/test/java/org/embulk/spi/MockParserPlugin.java +73 -0
  135. data/embulk-core/src/test/java/org/embulk/spi/PageTestUtils.java +78 -0
  136. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +67 -0
  137. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputRunner.java +180 -0
  138. data/embulk-core/src/test/java/org/embulk/spi/TestFileOutputRunner.java +192 -0
  139. data/embulk-core/src/test/java/org/embulk/spi/TestInputStreamFileInput.java +188 -0
  140. data/embulk-core/src/test/java/org/embulk/spi/TestPageBuilderReader.java +301 -0
  141. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestamp.java +116 -0
  142. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestampFormatterParser.java +52 -0
  143. data/embulk-core/src/test/java/org/embulk/spi/type/TestTypeSerDe.java +45 -0
  144. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineDecoder.java +132 -0
  145. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineEncoder.java +123 -0
  146. data/embulk-standards/build.gradle +6 -0
  147. data/embulk-standards/pom.xml +68 -0
  148. data/embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java +158 -0
  149. data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +233 -0
  150. data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +355 -0
  151. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java +55 -0
  152. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java +39 -0
  153. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +138 -0
  154. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java +128 -0
  155. data/embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java +46 -0
  156. data/embulk-standards/src/main/java/org/embulk/standards/S3FileInputPlugin.java +238 -0
  157. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginExtension.java +16 -0
  158. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +44 -0
  159. data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +71 -0
  160. data/embulk-standards/src/main/resources/META-INF/services/org.embulk.spi.Extension +1 -0
  161. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +69 -0
  162. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +291 -0
  163. data/embulk-standards/src/test/java/org/embulk/standards/TestS3FileInputPlugin.java +43 -0
  164. data/embulk.gemspec +27 -0
  165. data/examples/config.yml +34 -0
  166. data/examples/csv/sample.csv.gz +0 -0
  167. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  168. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  169. data/gradlew +164 -0
  170. data/gradlew.bat +90 -0
  171. data/lib/embulk.rb +16 -0
  172. data/lib/embulk/buffer.rb +17 -0
  173. data/lib/embulk/column.rb +47 -0
  174. data/lib/embulk/command/embulk.rb +39 -0
  175. data/lib/embulk/command/embulk_example.rb +32 -0
  176. data/lib/embulk/command/embulk_generate_bin.rb +62 -0
  177. data/lib/embulk/command/embulk_run.rb +243 -0
  178. data/lib/embulk/data/bundle/.bundle/config +3 -0
  179. data/lib/embulk/data/bundle/Gemfile +31 -0
  180. data/lib/embulk/data/bundle/Gemfile.lock +8 -0
  181. data/lib/embulk/data/bundle/embulk/input_example.rb +40 -0
  182. data/lib/embulk/data/bundle/embulk/output_example.rb +51 -0
  183. data/lib/embulk/data_source.rb +66 -0
  184. data/lib/embulk/error.rb +5 -0
  185. data/lib/embulk/guess_charset.rb +26 -0
  186. data/lib/embulk/guess_csv.rb +195 -0
  187. data/lib/embulk/guess_gzip.rb +18 -0
  188. data/lib/embulk/guess_newline.rb +20 -0
  189. data/lib/embulk/guess_plugin.rb +113 -0
  190. data/lib/embulk/input_plugin.rb +53 -0
  191. data/lib/embulk/java/bootstrap.rb +12 -0
  192. data/lib/embulk/java/imports.rb +26 -0
  193. data/lib/embulk/java/time_helper.rb +77 -0
  194. data/lib/embulk/output_plugin.rb +104 -0
  195. data/lib/embulk/page.rb +28 -0
  196. data/lib/embulk/page_builder.rb +22 -0
  197. data/lib/embulk/plugin.rb +152 -0
  198. data/lib/embulk/plugin_registry.rb +70 -0
  199. data/lib/embulk/schema.rb +85 -0
  200. data/lib/embulk/time_format_guess.rb +331 -0
  201. data/lib/embulk/version.rb +3 -0
  202. data/pom.xml +533 -0
  203. data/settings.gradle +5 -0
  204. metadata +370 -0
@@ -0,0 +1,51 @@
1
module Embulk

  # Example output plugin. It prints every record it receives to stdout as
  # JSON, prefixed with the configured 'message', and reports how many
  # records each instance handled.
  class OutputExample < OutputPlugin
    # output plugin file name must be: embulk/output_<name>.rb
    Plugin.register_output('example', self)

    # Builds the task from +config+ and yields it to the given block.
    #
    # config          - object responding to #param; 'message' defaults to "record".
    # schema          - unused here.
    # processor_count - unused here.
    # control         - block that receives the task; its return value is
    #                   printed as the commit reports.
    #
    # Returns an empty Hash.
    def self.transaction(config, schema, processor_count, &control)
      message = config.param('message', :string, default: "record")
      task = {'message' => message}

      puts "Example output started."
      reports = yield(task)
      puts "Example output finished. Commit reports = #{reports.to_json}"

      {}
    end

    # Announces the instance, delegates to the superclass with the same
    # arguments, then reads 'message' from the task and resets the counter.
    def initialize(task, schema, index)
      puts "Example output thread #{index}..."
      super
      @message = task.param('message', :string)
      @records = 0
    end

    # Nothing to release.
    def close
    end

    # Prints each record of +page+ as a JSON object keyed by column name and
    # counts it. `schema` is not defined in this class; presumably an
    # accessor inherited from OutputPlugin -- confirm.
    def add(page)
      page.each do |record|
        row = Hash[schema.names.zip(record)]
        puts "#{@message}: #{row.to_json}"
        @records += 1
      end
    end

    # No buffered state to flush.
    def finish
    end

    # Nothing to roll back.
    def abort
    end

    # Returns the commit report: a Hash holding the number of records
    # counted by #add.
    def commit
      {"records" => @records}
    end
  end

end
@@ -0,0 +1,66 @@
1
module Embulk
  require 'json'

  # Hash subclass with typed parameter lookup and, when running on Java,
  # JSON-based conversion to and from the Java-side data source objects.
  class DataSource < Hash
    # Looks up +key+ and converts its value according to +type+.
    #
    # key     - key to look up in this hash.
    # type    - :integer, :float, :string, :bool, :hash, :array, or any
    #           object responding to #load(value).
    # options - Hash; when it has a :default key, that value is returned
    #           as-is (no conversion) if +key+ is absent.
    #
    # Returns the converted value, or the default.
    # Raises ArgumentError for a non-Hash/:hash or non-Array/:array value
    # and for an unknown type; Integer()/Float() raise on unparsable input;
    # raises RuntimeError when +key+ is absent and no default was given.
    def param(key, type, options={})
      unless has_key?(key)
        return options[:default] if options.has_key?(:default)
        raise "Required field #{key.to_s.dump} is not set"
      end

      raw = self[key]
      case type
      when :integer
        Integer(raw)
      when :float
        Float(raw)
      when :string
        String(raw)
      when :bool
        !!raw  # TODO validation
      when :hash
        raise ArgumentError, "Invalid value for :hash" unless raw.is_a?(Hash)
        raw
      when :array
        raise ArgumentError, "Invalid value for :array" unless raw.is_a?(Array)
        raw
      else
        # Anything else must be a loader object exposing #load.
        unless type.respond_to?(:load)
          raise ArgumentError, "Unknown type #{type.to_s.dump}"
        end
        type.load(raw)
      end
    end

    if Embulk.java?
      # Builds a DataSource from a Java object by parsing its toString
      # output as JSON.
      def self.from_java_object(java_data_source_impl)
        serialized = java_data_source_impl.toString
        new.merge!(JSON.parse(serialized))
      end

      # Builds a DataSource holding the entries of +hash+.
      def self.from_ruby_hash(hash)
        new.merge!(hash)
      end

      # Serializes this hash to JSON and has the injected ModelManager read
      # it back as a Java DataSourceImpl.
      def java_object
        serialized = to_json
        Java::Injected::ModelManager.readObject(Java::DataSourceImpl.java_class, serialized.to_java)
      end

      # Reads this hash (as JSON) into an instance of +task_type+ through
      # ModelManager.readObjectWithConfigSerDe.
      def load_config(task_type)
        Java::Injected::ModelManager.readObjectWithConfigSerDe(task_type.java_class, to_json.to_java)
      end

      # Reads this hash (as JSON) into an instance of +task_type+ through
      # ModelManager.readObject.
      def load_task(task_type)
        Java::Injected::ModelManager.readObject(task_type.java_class, to_json.to_java)
      end
    end
  end

end
@@ -0,0 +1,5 @@
1
+
2
module Embulk
  # StandardError subclass named for configuration problems.
  # NOTE(review): no raise site is visible here -- confirm where it is used.
  class ConfigError < StandardError; end
end
@@ -0,0 +1,26 @@
1
module Embulk

  # Guess plugin that proposes the parser charset for a sample buffer
  # using the ICU4J CharsetDetector.
  class GuessCharset < GuessPlugin
    Plugin.register_guess('charset', self)

    # config        - unused here.
    # sample_buffer - object responding to #to_java_bytes.
    #
    # Returns {"parser" => {"charset" => name}}. The name is "UTF-8" when
    # the detector's confidence is below 50 or when it reports ISO-8859-1.
    def guess(config, sample_buffer)
      # ICU4J
      detector = com.ibm.icu.text.CharsetDetector.new
      detector.setText(sample_buffer.to_java_bytes)
      best_match = detector.detect

      name = best_match.getConfidence < 50 ? "UTF-8" : best_match.getName
      # ISO-8859-1 means ASCII which is a subset
      # of UTF-8 in most of cases due to lack of
      # sample data set
      name = "UTF-8" if name == "ISO-8859-1"

      {"parser" => {"charset" => name}}
    end
  end

end
@@ -0,0 +1,195 @@
1
module Embulk
  require_relative 'time_format_guess'

  # Guess plugin that inspects decoded sample lines and proposes CSV parser
  # settings: delimiter, quote character, header line and column schema.
  class GuessCsv < LineGuessPlugin
    Plugin.register_guess('csv', self)

    DELIMITER_CANDIDATES = [
      ",", "\t", "|"
    ]

    QUOTE_CANDIDATES = [
      "\"", "'"
    ]

    # Guesses CSV parser options from +sample_lines+.
    #
    # config       - indexable by "parser"; an existing "header_line" or
    #                "columns" entry there is respected and not re-guessed.
    # sample_lines - Array of Strings, one per line.
    #
    # Returns {"parser" => {...}} on success, or {} when no delimiter is
    # found or fewer than two columns are detected.
    def guess_lines(config, sample_lines)
      delim = guess_delimiter(sample_lines)
      # not CSV file
      return {} unless delim

      parser_config = config["parser"] || {}
      parser_guessed = {"type" => "csv", "delimiter" => delim}

      quote = guess_quote(sample_lines, delim)
      parser_guessed["quote"] = quote || ''

      sample_records = sample_lines.map {|line| line.split(delim) }  # TODO use CsvTokenizer
      first_types = guess_field_types(sample_records[0, 1])
      other_types = guess_field_types(sample_records[1..-1])

      # guess failed
      return {} if first_types.size <= 1 || other_types.size <= 1

      unless parser_config.has_key?("header_line")
        # The first line is a header when it differs in type from the rest
        # and consists only of string columns.
        parser_guessed["header_line"] =
          first_types != other_types && first_types.all? {|t| t == ["string"] }
      end

      unless parser_config.has_key?("columns")
        if parser_guessed["header_line"] || parser_config["header_line"]
          column_names = sample_records.first
        else
          # Exclusive range: exactly one generated name per detected column.
          column_names = (0...other_types.size).map {|i| "c#{i}" }
        end

        schema = []
        column_names.zip(other_types).each do |name,(type,format)|
          next unless name && type
          column = {"name" => name, "type" => type}
          column["format"] = format if format
          schema << column
        end
        parser_guessed["columns"] = schema
      end

      {"parser" => parser_guessed}
    end

    private

    # Scores each candidate by (total occurrences) / (standard deviation of
    # the per-line counts), so a character that appears often and evenly
    # across lines wins. Returns the best delimiter, or nil when no
    # candidate occurs or the best weight is not above 1.
    def guess_delimiter(sample_lines)
      delim_weights = DELIMITER_CANDIDATES.map do |d|
        counts = sample_lines.map {|line| line.count(d) }
        total = array_sum(counts)
        if total > 0
          stddev = array_standard_deviation(counts)
          stddev = 0.000000001 if stddev == 0.0  # avoid division by zero
          [d, total / stddev]
        else
          [nil, 0]
        end
      end

      delim, weight = delim_weights.sort_by {|_, w| w }.last
      delim if delim && weight > 1
    end

    # Scores each quote candidate per line (only lines containing it count):
    # the raw occurrence count, plus 20 for each field wrapped in the quote
    # with no quote inside, plus 40 for each wrapped field with no delimiter
    # inside. Returns the candidate with the highest average score when that
    # average is at least 10.0, otherwise nil.
    def guess_quote(sample_lines, delim)
      delim_regexp = Regexp.escape(delim)
      quote_weights = QUOTE_CANDIDATES.map do |q|
        q_regexp = Regexp.escape(q)
        # Both patterns depend only on the candidate, so build them once.
        no_quote_inside = /(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{q_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/
        no_delim_inside = /(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{delim_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/

        weights = sample_lines.map do |line|
          count = line.count(q)
          if count > 0
            count + line.scan(no_quote_inside).size * 20 + line.scan(no_delim_inside).size * 40
          else
            nil
          end
        end.compact
        weights.empty? ? 0 : array_avg(weights)
      end

      quote, weight = QUOTE_CANDIDATES.zip(quote_weights).sort_by {|_, w| w }.last
      weight >= 10.0 ? quote : nil
    end

    # field_lines - Array of records, each an Array of field Strings.
    #
    # Returns one entry per column position: [type_name], or
    # ["timestamp", format] for columns whose values all look like
    # timestamps.
    def guess_field_types(field_lines)
      column_lines = []
      field_lines.each do |fields|
        fields.each_with_index {|field,i| (column_lines[i] ||= []) << guess_type(field) }
      end

      column_lines.map do |types|
        merged = types.inject(nil) {|acc,type| merge_type(acc, type) } || "string"
        if merged.is_a?(TimestampMatch)
          # A TimestampMatch survives merging only when every value in the
          # column was one, so each element responds to #text here.
          format = TimeFormatGuess.guess(types.map {|type| type.text })
          ["timestamp", format]
        else
          [merged]
        end
      end
    end

    # Sorted type pair => type they widen to when both occur in one column.
    TYPE_COALESCE = Hash[{
      long: :double,
      boolean: :long,
    }.map {|k,v|
      [[k.to_s, v.to_s].sort, v.to_s]
    }]

    # Combines the types of two values of the same column: equal types stay
    # as they are, nil yields the other type, pairs listed in TYPE_COALESCE
    # widen, and everything else degrades to "string".
    def merge_type(type1, type2)
      if type1 == type2
        type1
      elsif type1.nil? || type2.nil?
        type1 || type2
      else
        TYPE_COALESCE[[type1, type2].sort] || "string"
      end
    end

    # String equal to "timestamp" that also remembers the original field
    # text so a time format can be guessed from all values of a column.
    class TimestampMatch < String
      def initialize(text)
        super("timestamp")
        @text = text
      end
      attr_reader :text
    end

    # Classifies a single field. Returns "boolean", a TimestampMatch,
    # "long", "double" or "string".
    def guess_type(str)
      return "boolean" if ["true", "false"].include?(str)
      return TimestampMatch.new(str) if TimeFormatGuess.guess(str)
      # Canonical integers only: "007" or "+1" stay strings.
      return "long" if str.to_i.to_s == str

      if str.include?('.')
        int_part, frac_part = str.split(".", 2)
        # The fractional part may carry leading zeros ("1.05") but must be
        # digits only; round-tripping it through to_i would reject "05" and
        # accept "-5". The integer part follows the same canonical form as
        # "long", additionally allowing "-0" so that "-0.5" is a double.
        if int_part =~ /\A-?(?:0|[1-9]\d*)\z/ && frac_part =~ /\A\d+\z/
          return "double"
        end
      end

      "string"
    end

    # Sum of the elements; 0 for an empty array.
    def array_sum(array)
      array.inject(0) {|sum,i| sum + i }
    end

    # Arithmetic mean as a Float (NaN for an empty array).
    def array_avg(array)
      array.inject(0.0) {|sum,i| sum + i } / array.size
    end

    # Mean of squared deviations from the average (divides by array.size).
    def array_variance(array)
      avg = array_avg(array)
      array.inject(0.0) {|sum,i| sum + (i - avg) ** 2 } / array.size
    end

    # Square root of array_variance.
    def array_standard_deviation(array)
      Math.sqrt(array_variance(array))
    end
  end

end
@@ -0,0 +1,18 @@
1
module Embulk
  module Guess

    # Guess plugin that proposes the gzip decoder when the sample starts
    # with the two-byte gzip header.
    class GzipGuess < GuessPlugin
      Plugin.register_guess('gzip', self)

      # First two bytes of a gzip stream, as a binary (ASCII-8BIT) string.
      GZIP_HEADER = "\x1f\x8b".force_encoding('ASCII-8BIT').freeze

      # config        - unused here.
      # sample_buffer - sliceable with [0, 2] and comparable to a String.
      #
      # Returns {"decoders" => [{"type" => "gzip"}]} when the first two
      # bytes equal GZIP_HEADER, otherwise {}.
      def guess(config, sample_buffer)
        gzipped = sample_buffer[0,2] == GZIP_HEADER
        gzipped ? {"decoders" => [{"type" => "gzip"}]} : {}
      end
    end

  end
end
@@ -0,0 +1,20 @@
1
module Embulk

  # Guess plugin that proposes the parser newline style by counting
  # CR, LF and CRLF occurrences in the sample text.
  class GuessNewline < TextGuessPlugin
    Plugin.register_guess('newline', self)

    # config      - unused here.
    # sample_text - String to inspect.
    #
    # Returns {"parser" => {"newline" => "CRLF" | "CR" | "LF"}}.
    def guess_text(config, sample_text)
      crs = sample_text.count("\r")
      lfs = sample_text.count("\n")
      crlfs = sample_text.scan(/\r\n/).length

      # The halves below are integer divisions.
      newline =
        if crlfs > crs / 2 && crlfs > lfs / 2
          "CRLF"
        elsif crs > lfs / 2
          "CR"
        else
          "LF"
        end

      {"parser" => {"newline" => newline}}
    end
  end

end
@@ -0,0 +1,113 @@
1
module Embulk

  # Base class for guess plugins. Subclasses implement
  # #guess(config, sample_buffer) and return a Hash of guessed settings.
  class GuessPlugin
    # Must be overridden; the base implementation always raises
    # NotImplementedError.
    def guess(config, sample_buffer)
      raise NotImplementedError, "GuessPlugin#guess(config, sample_buffer) must be implemented"
    end

    if Embulk.java?
      # Wraps a new instance of this plugin class in a JavaAdapter, which
      # includes the Java::GuessPlugin interface.
      def self.java_object
        JavaAdapter.new(new)
      end

      # Wraps a Java guess plugin so it can be called like a Ruby one.
      def self.from_java_object(java_guess)
        RubyAdapter.new(java_guess)
      end

      # Ruby-facing wrapper around a Java guess plugin.
      class RubyAdapter < Embulk::GuessPlugin
        # Must be named `initialize` (not `initialized`): otherwise
        # `RubyAdapter.new(java_guess)` raises ArgumentError and
        # @java_guess is never assigned.
        def initialize(java_guess)
          @java_guess = java_guess
        end

        # Converts both arguments with #java_object, delegates to the Java
        # plugin and converts the result back into a DataSource.
        def guess(config, sample)
          java_config = config.java_object
          java_sample = sample.java_object
          java_next_config = @java_guess.guess(java_config, java_sample)
          DataSource.from_java_object(java_next_config)
        end

        # Returns the wrapped Java plugin.
        def java_object
          @java_guess
        end
      end

      # Java-facing wrapper around a Ruby guess plugin.
      class JavaAdapter
        include Java::GuessPlugin

        def initialize(ruby_guess)
          @ruby_guess = ruby_guess
        end

        # Converts the Java arguments to Ruby objects, delegates to the
        # Ruby plugin and converts the resulting Hash back to a Java object.
        def guess(java_config, java_sample)
          config = DataSource.from_java_object(java_config)
          sample = Buffer.from_java_object(java_sample)
          next_config_hash = @ruby_guess.guess(config, sample)
          DataSource.from_ruby_hash(next_config_hash).java_object
        end
      end
    end
  end

  # Guess plugin base that decodes the sample into a single text and hands
  # it to #guess_text. Lines read from the decoder are re-joined with the
  # newline string of the decoder task.
  class TextGuessPlugin < GuessPlugin
    # Returns the result of #guess_text, or an empty DataSource when the
    # decoder task cannot be loaded from +config+.
    def guess(config, sample)
      # TODO pure-ruby LineDecoder implementation?
      begin
        task = config.load_config(Java::LineDecoder::DecoderTask)
      rescue => e
        # Best effort: report the failure and guess nothing.
        # TODO log?
        p e
        p e.backtrace
        return DataSource.new
      end

      decoder = Java::LineDecoder.new(Java::ListFileInput.new([[sample.java_object]]), task)
      sample_text = ''
      while decoder.nextFile
        first = true
        while line = decoder.poll
          if first
            first = false
          else
            # Separator goes between lines only, never before the first.
            sample_text << task.getNewline().getString()
          end
          sample_text << line
        end
      end

      guess_text(config, sample_text)
    end

    # Must be overridden; the base implementation always raises
    # NotImplementedError.
    def guess_text(config, sample_text)
      raise NotImplementedError, "TextGuessPlugin#guess_text(config, sample_text) must be implemented"
    end
  end

  # Guess plugin base that decodes the sample into an Array of lines and
  # hands it to #guess_lines.
  class LineGuessPlugin < GuessPlugin
    # Returns the result of #guess_lines, or an empty DataSource when the
    # decoder task cannot be loaded from +config+.
    def guess(config, sample)
      # TODO pure-ruby LineDecoder implementation?
      begin
        task = config.load_config(Java::LineDecoder::DecoderTask)
      rescue => e
        # Best effort: report the failure and guess nothing.
        # TODO log?
        p e
        p e.backtrace
        return DataSource.new
      end

      decoder = Java::LineDecoder.new(Java::ListFileInput.new([[sample.java_object]]), task)
      sample_lines = []
      while decoder.nextFile
        while line = decoder.poll
          sample_lines << line
        end
      end

      guess_lines(config, sample_lines)
    end

    # Must be overridden; the base implementation always raises
    # NotImplementedError.
    def guess_lines(config, sample_lines)
      raise NotImplementedError, "LineGuessPlugin#guess_lines(config, sample_lines) must be implemented"
    end
  end

end