embulk 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +13 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +33 -0
- data/README.md +117 -0
- data/Rakefile +58 -0
- data/bin/embulk +63 -0
- data/build.gradle +149 -0
- data/embulk-cli/build.gradle +6 -0
- data/embulk-cli/pom.xml +94 -0
- data/embulk-cli/src/main/java/org/embulk/cli/Main.java +15 -0
- data/embulk-core/build.gradle +6 -0
- data/embulk-core/pom.xml +143 -0
- data/embulk-core/src/main/java/org/embulk/EmbulkService.java +39 -0
- data/embulk-core/src/main/java/org/embulk/command/Runner.java +199 -0
- data/embulk-core/src/main/java/org/embulk/command/TablePrinter.java +119 -0
- data/embulk-core/src/main/java/org/embulk/config/CommitReport.java +26 -0
- data/embulk-core/src/main/java/org/embulk/config/Config.java +15 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigDefault.java +15 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigException.java +20 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigLoader.java +83 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigSource.java +28 -0
- data/embulk-core/src/main/java/org/embulk/config/DataSource.java +35 -0
- data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +208 -0
- data/embulk-core/src/main/java/org/embulk/config/DataSourceSerDe.java +80 -0
- data/embulk-core/src/main/java/org/embulk/config/GenericTypeReference.java +20 -0
- data/embulk-core/src/main/java/org/embulk/config/ModelManager.java +125 -0
- data/embulk-core/src/main/java/org/embulk/config/NextConfig.java +26 -0
- data/embulk-core/src/main/java/org/embulk/config/Task.java +10 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskInvocationHandler.java +180 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskSerDe.java +343 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskSource.java +28 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +37 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskValidator.java +24 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +45 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExecuteInterruptedException.java +10 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExecuteResult.java +19 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExtensionServiceLoaderModule.java +43 -0
- data/embulk-core/src/main/java/org/embulk/exec/ForSystemConfig.java +16 -0
- data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +307 -0
- data/embulk-core/src/main/java/org/embulk/exec/LocalExecutor.java +274 -0
- data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +30 -0
- data/embulk-core/src/main/java/org/embulk/exec/NoSampleException.java +10 -0
- data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +58 -0
- data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +138 -0
- data/embulk-core/src/main/java/org/embulk/exec/PreviewResult.java +27 -0
- data/embulk-core/src/main/java/org/embulk/exec/PreviewedNoticeError.java +17 -0
- data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +116 -0
- data/embulk-core/src/main/java/org/embulk/exec/SystemConfigModule.java +24 -0
- data/embulk-core/src/main/java/org/embulk/jruby/JRubyPluginSource.java +69 -0
- data/embulk-core/src/main/java/org/embulk/jruby/JRubyScriptingModule.java +100 -0
- data/embulk-core/src/main/java/org/embulk/plugin/BuiltinPluginSourceModule.java +17 -0
- data/embulk-core/src/main/java/org/embulk/plugin/InjectedPluginSource.java +92 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginManager.java +34 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginSource.java +6 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginSourceNotMatchException.java +19 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginType.java +47 -0
- data/embulk-core/src/main/java/org/embulk/plugin/SetThreadContextClassLoader.java +19 -0
- data/embulk-core/src/main/java/org/embulk/spi/Buffer.java +113 -0
- data/embulk-core/src/main/java/org/embulk/spi/BufferAllocator.java +8 -0
- data/embulk-core/src/main/java/org/embulk/spi/Column.java +92 -0
- data/embulk-core/src/main/java/org/embulk/spi/ColumnConfig.java +79 -0
- data/embulk-core/src/main/java/org/embulk/spi/DecoderPlugin.java +16 -0
- data/embulk-core/src/main/java/org/embulk/spi/EncoderPlugin.java +16 -0
- data/embulk-core/src/main/java/org/embulk/spi/Exec.java +76 -0
- data/embulk-core/src/main/java/org/embulk/spi/ExecAction.java +6 -0
- data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +105 -0
- data/embulk-core/src/main/java/org/embulk/spi/Extension.java +42 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileInput.java +11 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileInputPlugin.java +19 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +113 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileOutput.java +13 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileOutputPlugin.java +20 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileOutputRunner.java +167 -0
- data/embulk-core/src/main/java/org/embulk/spi/FormatterPlugin.java +18 -0
- data/embulk-core/src/main/java/org/embulk/spi/GuessPlugin.java +9 -0
- data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +20 -0
- data/embulk-core/src/main/java/org/embulk/spi/OutputPlugin.java +21 -0
- data/embulk-core/src/main/java/org/embulk/spi/Page.java +45 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +327 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageFormat.java +47 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageOutput.java +11 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageReader.java +227 -0
- data/embulk-core/src/main/java/org/embulk/spi/ParserPlugin.java +17 -0
- data/embulk-core/src/main/java/org/embulk/spi/Schema.java +101 -0
- data/embulk-core/src/main/java/org/embulk/spi/SchemaConfig.java +52 -0
- data/embulk-core/src/main/java/org/embulk/spi/SchemaVisitor.java +14 -0
- data/embulk-core/src/main/java/org/embulk/spi/Transactional.java +10 -0
- data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileInput.java +17 -0
- data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileOutput.java +19 -0
- data/embulk-core/src/main/java/org/embulk/spi/TransactionalPageOutput.java +17 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/DateTimeZoneSerDe.java +57 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelper.java +8 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelperFactory.java +6 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +159 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +98 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormatter.java +55 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParseException.java +6 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParser.java +60 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampSerDe.java +50 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/AbstractType.java +55 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/BooleanType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/DoubleType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/LongType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/StringType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/TimestampType.java +39 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/Type.java +15 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/TypeDeserializer.java +47 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/Types.java +14 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/CharsetSerDe.java +55 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Decoders.java +81 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Encoders.java +81 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +110 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/FileOutputOutputStream.java +94 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/InputStreamFileInput.java +111 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Inputs.java +74 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/LineDecoder.java +118 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/LineEncoder.java +109 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/ListFileInput.java +52 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Newline.java +38 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +102 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Pages.java +139 -0
- data/embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java +110 -0
- data/embulk-core/src/test/java/org/embulk/GuiceBinder.java +72 -0
- data/embulk-core/src/test/java/org/embulk/RandomManager.java +53 -0
- data/embulk-core/src/test/java/org/embulk/TestPluginSourceModule.java +23 -0
- data/embulk-core/src/test/java/org/embulk/TestUtilityModule.java +17 -0
- data/embulk-core/src/test/java/org/embulk/config/TestConfigSource.java +114 -0
- data/embulk-core/src/test/java/org/embulk/config/TestTaskSource.java +70 -0
- data/embulk-core/src/test/java/org/embulk/plugin/MockPluginSource.java +57 -0
- data/embulk-core/src/test/java/org/embulk/plugin/TestPluginType.java +18 -0
- data/embulk-core/src/test/java/org/embulk/spi/MockFileOutput.java +63 -0
- data/embulk-core/src/test/java/org/embulk/spi/MockFormatterPlugin.java +101 -0
- data/embulk-core/src/test/java/org/embulk/spi/MockParserPlugin.java +73 -0
- data/embulk-core/src/test/java/org/embulk/spi/PageTestUtils.java +78 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +67 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestFileInputRunner.java +180 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestFileOutputRunner.java +192 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestInputStreamFileInput.java +188 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestPageBuilderReader.java +301 -0
- data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestamp.java +116 -0
- data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestampFormatterParser.java +52 -0
- data/embulk-core/src/test/java/org/embulk/spi/type/TestTypeSerDe.java +45 -0
- data/embulk-core/src/test/java/org/embulk/spi/util/TestLineDecoder.java +132 -0
- data/embulk-core/src/test/java/org/embulk/spi/util/TestLineEncoder.java +123 -0
- data/embulk-standards/build.gradle +6 -0
- data/embulk-standards/pom.xml +68 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java +158 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +233 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +355 -0
- data/embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java +55 -0
- data/embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java +39 -0
- data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +138 -0
- data/embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java +128 -0
- data/embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java +46 -0
- data/embulk-standards/src/main/java/org/embulk/standards/S3FileInputPlugin.java +238 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginExtension.java +16 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +44 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +71 -0
- data/embulk-standards/src/main/resources/META-INF/services/org.embulk.spi.Extension +1 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +69 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +291 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestS3FileInputPlugin.java +43 -0
- data/embulk.gemspec +27 -0
- data/examples/config.yml +34 -0
- data/examples/csv/sample.csv.gz +0 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +6 -0
- data/gradlew +164 -0
- data/gradlew.bat +90 -0
- data/lib/embulk.rb +16 -0
- data/lib/embulk/buffer.rb +17 -0
- data/lib/embulk/column.rb +47 -0
- data/lib/embulk/command/embulk.rb +39 -0
- data/lib/embulk/command/embulk_example.rb +32 -0
- data/lib/embulk/command/embulk_generate_bin.rb +62 -0
- data/lib/embulk/command/embulk_run.rb +243 -0
- data/lib/embulk/data/bundle/.bundle/config +3 -0
- data/lib/embulk/data/bundle/Gemfile +31 -0
- data/lib/embulk/data/bundle/Gemfile.lock +8 -0
- data/lib/embulk/data/bundle/embulk/input_example.rb +40 -0
- data/lib/embulk/data/bundle/embulk/output_example.rb +51 -0
- data/lib/embulk/data_source.rb +66 -0
- data/lib/embulk/error.rb +5 -0
- data/lib/embulk/guess_charset.rb +26 -0
- data/lib/embulk/guess_csv.rb +195 -0
- data/lib/embulk/guess_gzip.rb +18 -0
- data/lib/embulk/guess_newline.rb +20 -0
- data/lib/embulk/guess_plugin.rb +113 -0
- data/lib/embulk/input_plugin.rb +53 -0
- data/lib/embulk/java/bootstrap.rb +12 -0
- data/lib/embulk/java/imports.rb +26 -0
- data/lib/embulk/java/time_helper.rb +77 -0
- data/lib/embulk/output_plugin.rb +104 -0
- data/lib/embulk/page.rb +28 -0
- data/lib/embulk/page_builder.rb +22 -0
- data/lib/embulk/plugin.rb +152 -0
- data/lib/embulk/plugin_registry.rb +70 -0
- data/lib/embulk/schema.rb +85 -0
- data/lib/embulk/time_format_guess.rb +331 -0
- data/lib/embulk/version.rb +3 -0
- data/pom.xml +533 -0
- data/settings.gradle +5 -0
- metadata +370 -0
@@ -0,0 +1,51 @@
|
|
1
|
+
module Embulk
|
2
|
+
|
3
|
+
# Example output plugin: prints every record it receives as JSON and reports
# how many records each instance handled.
class OutputExample < OutputPlugin
  # output plugin file name must be: embulk/output_<name>.rb
  Plugin.register_output('example', self)

  # Builds the task from the config, yields it so the caller can run the
  # output, prints the commit reports the block returns, and returns an
  # empty hash.
  def self.transaction(config, schema, processor_count, &control)
    message = config.param('message', :string, default: "record")
    task = {'message' => message}

    puts "Example output started."
    reports = yield(task)
    puts "Example output finished. Commit reports = #{reports.to_json}"

    {}
  end

  # Announces the instance index, lets the superclass initialize itself with
  # the same arguments, then reads the message prefix and resets the counter.
  def initialize(task, schema, index)
    puts "Example output thread #{index}..."
    super
    @message = task.param('message', :string)
    @records = 0
  end

  # Nothing to release.
  def close
  end

  # Prints each record of the page as a JSON object keyed by column name and
  # counts it.
  def add(page)
    page.each do |record|
      row = Hash[schema.names.zip(record)]
      puts "#{@message}: #{row.to_json}"
      @records += 1
    end
  end

  # Nothing to flush.
  def finish
  end

  # Nothing to roll back.
  def abort
  end

  # Returns the commit report: the number of records this instance printed.
  def commit
    {"records" => @records}
  end
end
|
50
|
+
|
51
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module Embulk
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
class DataSource < Hash
|
5
|
+
# Looks up +key+ in this hash and coerces the stored value to +type+.
#
# key     - key exactly as stored in the hash (no string/symbol conversion).
# type    - :integer, :float, :string, :bool, :hash, :array, or any object
#           that responds to #load (the value is passed to type.load).
# options - :default => value returned as-is (not coerced) when the key is
#           absent.
#
# Returns the coerced value, or the default when the key is absent.
# Raises ArgumentError for an invalid :hash / :array / :bool value or an
# unknown type, RuntimeError when the key is absent and no default is given;
# Integer() and Float() raise their own errors for unparsable values.
def param(key, type, options={})
  unless has_key?(key)
    return options[:default] if options.has_key?(:default)
    raise "Required field #{key.to_s.dump} is not set"
  end

  v = self[key]
  case type
  when :integer
    Integer(v)
  when :float
    Float(v)
  when :string
    String(v)
  when :bool
    if v.is_a?(String)
      # A plain `!!v` turns the string "false" into true, so strings are
      # parsed explicitly and anything unrecognised is rejected.
      case v.strip.downcase
      when "true" then true
      when "false" then false
      else
        raise ArgumentError, "Invalid value for :bool"
      end
    else
      # Non-string values keep the original truthiness coercion.
      !!v
    end
  when :hash
    raise ArgumentError, "Invalid value for :hash" unless v.is_a?(Hash)
    v
  when :array
    raise ArgumentError, "Invalid value for :array" unless v.is_a?(Array)
    v
  else
    unless type.respond_to?(:load)
      raise ArgumentError, "Unknown type #{type.to_s.dump}"
    end
    type.load(v)
  end
end
|
40
|
+
|
41
|
+
# Java bridge: only defined when Embulk.java? is true.
if Embulk.java?
  class << self
    # Builds a DataSource from a Java object by parsing the JSON text that
    # its toString returns.
    def from_java_object(java_data_source_impl)
      new.update(JSON.parse(java_data_source_impl.toString))
    end

    # Builds a DataSource holding a copy of the entries of a plain Ruby hash.
    def from_ruby_hash(hash)
      new.update(hash)
    end
  end

  # Serializes this hash to JSON and has the injected ModelManager read it
  # back as a Java DataSourceImpl.
  def java_object
    serialized = to_json
    target_class = Java::DataSourceImpl.java_class
    Java::Injected::ModelManager.readObject(target_class, serialized.to_java)
  end

  # Reads this hash (as JSON) into an instance of the given task type via
  # ModelManager.readObjectWithConfigSerDe.
  def load_config(task_type)
    target_class = task_type.java_class
    Java::Injected::ModelManager.readObjectWithConfigSerDe(target_class, to_json.to_java)
  end

  # Reads this hash (as JSON) into an instance of the given task type via
  # ModelManager.readObject.
  def load_task(task_type)
    target_class = task_type.java_class
    Java::Injected::ModelManager.readObject(target_class, to_json.to_java)
  end
end
|
64
|
+
end
|
65
|
+
|
66
|
+
end
|
data/lib/embulk/error.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
module Embulk
|
2
|
+
|
3
|
+
class GuessCharset < GuessPlugin
|
4
|
+
Plugin.register_guess('charset', self)
|
5
|
+
|
6
|
+
# Guesses the character set of the sample with ICU4J's CharsetDetector.
#
# config        - current configuration (not consulted here).
# sample_buffer - sample data; must respond to #to_java_bytes.
#
# Returns {"parser" => {"charset" => name}}. Falls back to "UTF-8" when the
# detector reports no match, when the best match has a confidence below 50,
# or when the best match is "ISO-8859-1".
def guess(config, sample_buffer)
  # ICU4J
  detector = com.ibm.icu.text.CharsetDetector.new
  detector.setText(sample_buffer.to_java_bytes)
  best_match = detector.detect

  name =
    if best_match.nil? || best_match.getConfidence < 50
      # detect returns null when no charset matches at all; treat that the
      # same as a low-confidence match instead of failing on nil.
      "UTF-8"
    else
      best_match.getName
    end

  # ISO-8859-1 means ASCII which is a subset
  # of UTF-8 in most of cases due to lack of
  # sample data set
  name = "UTF-8" if name == "ISO-8859-1"

  {"parser" => {"charset" => name}}
end
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
@@ -0,0 +1,195 @@
|
|
1
|
+
module Embulk
|
2
|
+
require_relative 'time_format_guess'
|
3
|
+
|
4
|
+
class GuessCsv < LineGuessPlugin
|
5
|
+
Plugin.register_guess('csv', self)
|
6
|
+
|
7
|
+
DELIMITER_CANDIDATES = [
|
8
|
+
",", "\t", "|"
|
9
|
+
]
|
10
|
+
|
11
|
+
QUOTE_CANDIDATES = [
|
12
|
+
"\"", "'"
|
13
|
+
]
|
14
|
+
|
15
|
+
# Guesses CSV parser settings from sample lines.
#
# config       - current configuration; config["parser"] (if present) decides
#                whether "header_line" and "columns" still need guessing.
# sample_lines - array of line strings.
#
# Returns {"parser" => {...}} with "type", "delimiter", "quote" and, unless
# already configured, "header_line" and "columns". Returns {} when no
# delimiter is found or when the first line or the remaining lines yield at
# most one column.
def guess_lines(config, sample_lines)
  delim = guess_delimiter(sample_lines)
  # not CSV file
  return {} unless delim

  parser_config = config["parser"] || {}
  parser_guessed = {"type" => "csv", "delimiter" => delim}
  parser_guessed["quote"] = guess_quote(sample_lines, delim) || ''

  sample_records = sample_lines.map {|line| line.split(delim) }  # TODO use CsvTokenizer
  first_types = guess_field_types(sample_records[0, 1])
  other_types = guess_field_types(sample_records[1..-1])

  # guess failed
  return {} if first_types.size <= 1 || other_types.size <= 1

  unless parser_config.has_key?("header_line")
    # The first line is taken as a header when its types differ from the
    # rest and every one of its fields looks like a plain string.
    all_strings = first_types.all? {|t| t == ["string"] }
    parser_guessed["header_line"] = (first_types != other_types && all_strings)
  end

  unless parser_config.has_key?("columns")
    column_names =
      if parser_guessed["header_line"] || parser_config["header_line"]
        sample_records.first
      else
        # Exclusive range: exactly one generated name per guessed column.
        # (The inclusive range produced a surplus name that only the
        # name/type filter below discarded.)
        (0...other_types.size).map {|i| "c#{i}" }
      end

    schema = []
    column_names.zip(other_types).each do |name, (type, format)|
      next unless name && type
      column = {"name" => name, "type" => type}
      column["format"] = format if format
      schema << column
    end
    parser_guessed["columns"] = schema
  end

  {"parser" => parser_guessed}
end
|
62
|
+
|
63
|
+
private
|
64
|
+
|
65
|
+
# Picks the most plausible delimiter among DELIMITER_CANDIDATES.
#
# Each candidate is scored as (total occurrences) / (standard deviation of
# its per-line counts), so a character that appears often and equally often
# on every line scores highest. A zero deviation is replaced by a tiny
# value to avoid dividing by zero.
#
# Returns the winning delimiter, or nil when no candidate occurs or the best
# score is not greater than 1.
def guess_delimiter(sample_lines)
  scored = DELIMITER_CANDIDATES.map do |candidate|
    per_line = sample_lines.map {|line| line.count(candidate) }
    occurrences = array_sum(per_line)
    next [nil, 0] unless occurrences > 0

    spread = array_standard_deviation(per_line)
    spread = 0.000000001 if spread == 0.0
    [candidate, occurrences / spread]
  end

  best, score = *scored.sort_by {|_, w| w }.last
  (best != nil && score > 1) ? best : nil
end
|
86
|
+
|
87
|
+
# Picks the most plausible quote character among QUOTE_CANDIDATES.
#
# For every line containing the candidate, the line's weight is the number
# of occurrences, plus 20 per field that is wrapped in the candidate and
# contains no candidate inside, plus 40 per wrapped field that contains no
# delimiter inside. A candidate's score is the average weight over the
# lines that contain it (0 when none do).
#
# Returns the best candidate when its score is at least 10.0, otherwise nil.
def guess_quote(sample_lines, delim)
  delim_regexp = Regexp.escape(delim)

  scores = QUOTE_CANDIDATES.map do |q|
    q_regexp = Regexp.escape(q)
    line_weights = []

    sample_lines.each do |line|
      count = line.count(q)
      next unless count > 0

      without_inner_quote = line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{q_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size
      without_inner_delim = line.scan(/(?:\A|#{delim_regexp})\s*#{q_regexp}(?:(?!#{delim_regexp}).)*\s*#{q_regexp}(?:$|#{delim_regexp})/).size
      line_weights << count + without_inner_quote * 20 + without_inner_delim * 40
    end

    line_weights.empty? ? 0 : array_avg(line_weights)
  end

  quote, weight = QUOTE_CANDIDATES.zip(scores).sort_by {|_, w| w }.last
  weight >= 10.0 ? quote : nil
end
|
111
|
+
|
112
|
+
# Guesses one type per column from rows of field strings.
#
# field_lines - array of rows, each row an array of field strings.
#
# Every field is classified with guess_type and the guesses of a column are
# folded with merge_type ("string" when a column has no guess). Returns an
# array with one entry per column: [type], or ["timestamp", format] where the
# format comes from TimeFormatGuess.guess over the column's original texts.
def guess_field_types(field_lines)
  guesses_by_column = []
  field_lines.each do |fields|
    fields.each_with_index do |field, idx|
      guesses_by_column[idx] ||= []
      guesses_by_column[idx] << guess_type(field)
    end
  end

  guesses_by_column.map do |guesses|
    merged = guesses.inject(nil) {|acc, g| merge_type(acc, g) } || "string"
    next [merged] unless merged.is_a?(TimestampMatch)

    ["timestamp", TimeFormatGuess.guess(guesses.map {|g| g.text })]
  end
end
|
128
|
+
|
129
|
+
# Lookup table for merging two different guessed types. Keys are the sorted
# pair of type names, values the type the pair widens to:
#   long + double  -> double
#   boolean + long -> long
# Frozen so the shared table cannot be modified at runtime.
TYPE_COALESCE = Hash[{
  long: :double,
  boolean: :long,
}.map {|k,v|
  [[k.to_s, v.to_s].sort, v.to_s]
}].freeze

# Merges two guessed types into one.
#
# Equal types are returned unchanged, nil is ignored in favour of the other
# argument, a pair listed in TYPE_COALESCE widens to its mapped type, and
# any other combination falls back to "string".
def merge_type(type1, type2)
  return type1 if type1 == type2
  return type1 || type2 if type1.nil? || type2.nil?

  TYPE_COALESCE[[type1, type2].sort] || "string"
end
|
145
|
+
|
146
|
+
# A guessed type that always compares as the string "timestamp" while
# remembering the original field text it was guessed from.
class TimestampMatch < String
  # The original field text.
  attr_reader :text

  def initialize(text)
    super("timestamp")
    @text = text
  end
end
|
153
|
+
|
154
|
+
# Guesses the type of a single field string.
#
# Returns "boolean" for exactly "true" / "false", a TimestampMatch wrapping
# the text when TimeFormatGuess.guess recognises it, "long" for a canonical
# integer (no leading zeros, no "+" sign), "double" for a canonical decimal,
# and "string" otherwise.
def guess_type(str)
  return "boolean" if ["true", "false"].include?(str)

  return TimestampMatch.new(str) if TimeFormatGuess.guess(str)

  return "long" if str.to_i.to_s == str

  # Integer part follows the same canonical form as "long" (optionally
  # "-0"); the fraction is any run of digits. Comparing the fraction with
  # to_i.to_s rejected fractions with leading zeros ("1.05", "3.00") and
  # accepted a signed fraction ("1.-5").
  return "double" if str =~ /\A-?(?:0|[1-9][0-9]*)\.[0-9]+\z/

  "string"
end
|
176
|
+
|
177
|
+
# Sum of the elements; 0 for an empty array.
def array_sum(array)
  array.inject(0, :+)
end

# Arithmetic mean as a Float; NaN for an empty array.
def array_avg(array)
  array.inject(0.0, :+) / array.size
end

# Population variance (mean squared deviation from the average) as a Float;
# NaN for an empty array.
def array_variance(array)
  avg = array_avg(array)
  array.inject(0.0) {|acc, x| acc + (x - avg) ** 2 } / array.size
end

# Population standard deviation: square root of array_variance.
def array_standard_deviation(array)
  Math.sqrt(array_variance(array))
end
|
193
|
+
end
|
194
|
+
|
195
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Guess
|
3
|
+
|
4
|
+
# Guess plugin that suggests the gzip decoder when the sample starts with
# the two-byte gzip magic number.
class GzipGuess < GuessPlugin
  Plugin.register_guess('gzip', self)

  # Gzip magic number (0x1f 0x8b) as a binary string.
  GZIP_HEADER = "\x1f\x8b".force_encoding('ASCII-8BIT').freeze

  # Returns {"decoders" => [{"type" => "gzip"}]} when the first two
  # elements of the sample equal GZIP_HEADER, otherwise {}.
  def guess(config, sample_buffer)
    gzipped = (sample_buffer[0,2] == GZIP_HEADER)
    gzipped ? {"decoders" => [{"type" => "gzip"}]} : {}
  end
end
|
16
|
+
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Embulk
|
2
|
+
|
3
|
+
class GuessNewline < TextGuessPlugin
|
4
|
+
Plugin.register_guess('newline', self)
|
5
|
+
|
6
|
+
# Guesses the newline convention of the sample text.
#
# Counts "\r", "\n" and "\r\n" occurrences. "CRLF" wins when the pair count
# exceeds half (integer division) of both single counts, "CR" when the "\r"
# count exceeds half of the "\n" count, and "LF" otherwise (including for
# empty text). Returns {"parser" => {"newline" => name}}.
def guess_text(config, sample_text)
  crlf = sample_text.scan(/\r\n/).length
  cr = sample_text.count("\r")
  lf = sample_text.count("\n")

  newline =
    if crlf > cr / 2 && crlf > lf / 2
      "CRLF"
    elsif cr > lf / 2
      "CR"
    else
      "LF"
    end

  {"parser" => {"newline" => newline}}
end
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
@@ -0,0 +1,113 @@
|
|
1
|
+
module Embulk
|
2
|
+
|
3
|
+
# Base class for guess plugins. Subclasses implement #guess, which inspects
# a sample of the input and returns a hash of suggested configuration.
class GuessPlugin
  # config        - current configuration.
  # sample_buffer - sample of the input data.
  #
  # Must be overridden; the base implementation raises NotImplementedError.
  def guess(config, sample_buffer)
    raise NotImplementedError, "GuessPlugin#guess(config, sample_buffer) must be implemented"
  end

  # Java bridge: only defined when Embulk.java? is true.
  if Embulk.java?
    # Instantiates this plugin and wraps it in a JavaAdapter.
    def self.java_object
      JavaAdapter.new(new)
    end

    # Wraps a Java-side guess plugin so it can be used like a Ruby
    # GuessPlugin.
    def self.from_java_object(java_guess)
      RubyAdapter.new(java_guess)
    end

    # Presents a Java-side guess plugin through the Ruby GuessPlugin API.
    class RubyAdapter < Embulk::GuessPlugin
      # Was spelled `initialized`, so it never ran as the constructor:
      # RubyAdapter.new(java_guess) raised ArgumentError and @java_guess
      # was never set.
      def initialize(java_guess)
        @java_guess = java_guess
      end

      # Converts both arguments with #java_object, delegates to the wrapped
      # plugin and converts the result back into a DataSource.
      def guess(config, sample)
        java_config = config.java_object
        java_sample = sample.java_object
        java_next_config = @java_guess.guess(java_config, java_sample)
        DataSource.from_java_object(java_next_config)
      end

      # The wrapped Java-side plugin.
      def java_object
        @java_guess
      end
    end

    # Presents a Ruby GuessPlugin through Java::GuessPlugin.
    class JavaAdapter
      include Java::GuessPlugin

      def initialize(ruby_guess)
        @ruby_guess = ruby_guess
      end

      # Converts the Java arguments into a DataSource and a Buffer, runs the
      # Ruby plugin, and converts the hash it returns back into a Java
      # object.
      def guess(java_config, java_sample)
        config = DataSource.from_java_object(java_config)
        sample = Buffer.from_java_object(java_sample)
        next_config_hash = @ruby_guess.guess(config, sample)
        DataSource.from_ruby_hash(next_config_hash).java_object
      end
    end
  end
end
|
50
|
+
|
51
|
+
# Guess plugin base class for guessers that work on decoded text.
# Subclasses implement #guess_text.
class TextGuessPlugin < GuessPlugin
  # Decodes the sample into text with Java::LineDecoder and hands it to
  # #guess_text. Lines of a file are joined with the newline string of the
  # decoder task. When the decoder task cannot be loaded from the config,
  # the error and its backtrace are printed and an empty DataSource is
  # returned.
  def guess(config, sample)
    # TODO pure-ruby LineDecoder implementation?
    task =
      begin
        config.load_config(Java::LineDecoder::DecoderTask)
      rescue => error
        # TODO log?
        p error
        p error.backtrace
        return DataSource.new
      end

    file_input = Java::ListFileInput.new([[sample.java_object]])
    decoder = Java::LineDecoder.new(file_input, task)

    sample_text = ''
    while decoder.nextFile
      at_file_start = true
      while (line = decoder.poll)
        # A separator goes before every line except the first of a file.
        sample_text << task.getNewline().getString() unless at_file_start
        at_file_start = false
        sample_text << line
      end
    end

    guess_text(config, sample_text)
  end

  # config      - current configuration.
  # sample_text - decoded sample text.
  #
  # Must be overridden; the base implementation raises NotImplementedError.
  def guess_text(config, sample_text)
    raise NotImplementedError, "TextGuessPlugin#guess_text(config, sample_text) must be implemented"
  end
end
|
84
|
+
|
85
|
+
# Guess plugin base class for guessers that work on an array of decoded
# lines. Subclasses implement #guess_lines.
class LineGuessPlugin < GuessPlugin
  # Decodes the sample into lines with Java::LineDecoder and hands them to
  # #guess_lines. When the decoder task cannot be loaded from the config,
  # the error and its backtrace are printed and an empty DataSource is
  # returned.
  def guess(config, sample)
    # TODO pure-ruby LineDecoder implementation?
    task =
      begin
        config.load_config(Java::LineDecoder::DecoderTask)
      rescue => error
        # TODO log?
        p error
        p error.backtrace
        return DataSource.new
      end

    file_input = Java::ListFileInput.new([[sample.java_object]])
    decoder = Java::LineDecoder.new(file_input, task)

    sample_lines = []
    while decoder.nextFile
      while (line = decoder.poll)
        sample_lines << line
      end
    end

    guess_lines(config, sample_lines)
  end

  # config       - current configuration.
  # sample_lines - array of decoded sample lines.
  #
  # Must be overridden; the base implementation raises NotImplementedError.
  def guess_lines(config, sample_lines)
    raise NotImplementedError, "LineGuessPlugin#guess_lines(config, sample_lines) must be implemented"
  end
end
|
112
|
+
|
113
|
+
end
|