embulk 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +13 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +33 -0
- data/README.md +117 -0
- data/Rakefile +58 -0
- data/bin/embulk +63 -0
- data/build.gradle +149 -0
- data/embulk-cli/build.gradle +6 -0
- data/embulk-cli/pom.xml +94 -0
- data/embulk-cli/src/main/java/org/embulk/cli/Main.java +15 -0
- data/embulk-core/build.gradle +6 -0
- data/embulk-core/pom.xml +143 -0
- data/embulk-core/src/main/java/org/embulk/EmbulkService.java +39 -0
- data/embulk-core/src/main/java/org/embulk/command/Runner.java +199 -0
- data/embulk-core/src/main/java/org/embulk/command/TablePrinter.java +119 -0
- data/embulk-core/src/main/java/org/embulk/config/CommitReport.java +26 -0
- data/embulk-core/src/main/java/org/embulk/config/Config.java +15 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigDefault.java +15 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigException.java +20 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigLoader.java +83 -0
- data/embulk-core/src/main/java/org/embulk/config/ConfigSource.java +28 -0
- data/embulk-core/src/main/java/org/embulk/config/DataSource.java +35 -0
- data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +208 -0
- data/embulk-core/src/main/java/org/embulk/config/DataSourceSerDe.java +80 -0
- data/embulk-core/src/main/java/org/embulk/config/GenericTypeReference.java +20 -0
- data/embulk-core/src/main/java/org/embulk/config/ModelManager.java +125 -0
- data/embulk-core/src/main/java/org/embulk/config/NextConfig.java +26 -0
- data/embulk-core/src/main/java/org/embulk/config/Task.java +10 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskInvocationHandler.java +180 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskSerDe.java +343 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskSource.java +28 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +37 -0
- data/embulk-core/src/main/java/org/embulk/config/TaskValidator.java +24 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +45 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExecuteInterruptedException.java +10 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExecuteResult.java +19 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExtensionServiceLoaderModule.java +43 -0
- data/embulk-core/src/main/java/org/embulk/exec/ForSystemConfig.java +16 -0
- data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +307 -0
- data/embulk-core/src/main/java/org/embulk/exec/LocalExecutor.java +274 -0
- data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +30 -0
- data/embulk-core/src/main/java/org/embulk/exec/NoSampleException.java +10 -0
- data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +58 -0
- data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +138 -0
- data/embulk-core/src/main/java/org/embulk/exec/PreviewResult.java +27 -0
- data/embulk-core/src/main/java/org/embulk/exec/PreviewedNoticeError.java +17 -0
- data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +116 -0
- data/embulk-core/src/main/java/org/embulk/exec/SystemConfigModule.java +24 -0
- data/embulk-core/src/main/java/org/embulk/jruby/JRubyPluginSource.java +69 -0
- data/embulk-core/src/main/java/org/embulk/jruby/JRubyScriptingModule.java +100 -0
- data/embulk-core/src/main/java/org/embulk/plugin/BuiltinPluginSourceModule.java +17 -0
- data/embulk-core/src/main/java/org/embulk/plugin/InjectedPluginSource.java +92 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginManager.java +34 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginSource.java +6 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginSourceNotMatchException.java +19 -0
- data/embulk-core/src/main/java/org/embulk/plugin/PluginType.java +47 -0
- data/embulk-core/src/main/java/org/embulk/plugin/SetThreadContextClassLoader.java +19 -0
- data/embulk-core/src/main/java/org/embulk/spi/Buffer.java +113 -0
- data/embulk-core/src/main/java/org/embulk/spi/BufferAllocator.java +8 -0
- data/embulk-core/src/main/java/org/embulk/spi/Column.java +92 -0
- data/embulk-core/src/main/java/org/embulk/spi/ColumnConfig.java +79 -0
- data/embulk-core/src/main/java/org/embulk/spi/DecoderPlugin.java +16 -0
- data/embulk-core/src/main/java/org/embulk/spi/EncoderPlugin.java +16 -0
- data/embulk-core/src/main/java/org/embulk/spi/Exec.java +76 -0
- data/embulk-core/src/main/java/org/embulk/spi/ExecAction.java +6 -0
- data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +105 -0
- data/embulk-core/src/main/java/org/embulk/spi/Extension.java +42 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileInput.java +11 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileInputPlugin.java +19 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +113 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileOutput.java +13 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileOutputPlugin.java +20 -0
- data/embulk-core/src/main/java/org/embulk/spi/FileOutputRunner.java +167 -0
- data/embulk-core/src/main/java/org/embulk/spi/FormatterPlugin.java +18 -0
- data/embulk-core/src/main/java/org/embulk/spi/GuessPlugin.java +9 -0
- data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +20 -0
- data/embulk-core/src/main/java/org/embulk/spi/OutputPlugin.java +21 -0
- data/embulk-core/src/main/java/org/embulk/spi/Page.java +45 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +327 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageFormat.java +47 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageOutput.java +11 -0
- data/embulk-core/src/main/java/org/embulk/spi/PageReader.java +227 -0
- data/embulk-core/src/main/java/org/embulk/spi/ParserPlugin.java +17 -0
- data/embulk-core/src/main/java/org/embulk/spi/Schema.java +101 -0
- data/embulk-core/src/main/java/org/embulk/spi/SchemaConfig.java +52 -0
- data/embulk-core/src/main/java/org/embulk/spi/SchemaVisitor.java +14 -0
- data/embulk-core/src/main/java/org/embulk/spi/Transactional.java +10 -0
- data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileInput.java +17 -0
- data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileOutput.java +19 -0
- data/embulk-core/src/main/java/org/embulk/spi/TransactionalPageOutput.java +17 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/DateTimeZoneSerDe.java +57 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelper.java +8 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelperFactory.java +6 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +159 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +98 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormatter.java +55 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParseException.java +6 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParser.java +60 -0
- data/embulk-core/src/main/java/org/embulk/spi/time/TimestampSerDe.java +50 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/AbstractType.java +55 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/BooleanType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/DoubleType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/LongType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/StringType.java +12 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/TimestampType.java +39 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/Type.java +15 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/TypeDeserializer.java +47 -0
- data/embulk-core/src/main/java/org/embulk/spi/type/Types.java +14 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/CharsetSerDe.java +55 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Decoders.java +81 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Encoders.java +81 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +110 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/FileOutputOutputStream.java +94 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/InputStreamFileInput.java +111 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Inputs.java +74 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/LineDecoder.java +118 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/LineEncoder.java +109 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/ListFileInput.java +52 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Newline.java +38 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +102 -0
- data/embulk-core/src/main/java/org/embulk/spi/util/Pages.java +139 -0
- data/embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java +110 -0
- data/embulk-core/src/test/java/org/embulk/GuiceBinder.java +72 -0
- data/embulk-core/src/test/java/org/embulk/RandomManager.java +53 -0
- data/embulk-core/src/test/java/org/embulk/TestPluginSourceModule.java +23 -0
- data/embulk-core/src/test/java/org/embulk/TestUtilityModule.java +17 -0
- data/embulk-core/src/test/java/org/embulk/config/TestConfigSource.java +114 -0
- data/embulk-core/src/test/java/org/embulk/config/TestTaskSource.java +70 -0
- data/embulk-core/src/test/java/org/embulk/plugin/MockPluginSource.java +57 -0
- data/embulk-core/src/test/java/org/embulk/plugin/TestPluginType.java +18 -0
- data/embulk-core/src/test/java/org/embulk/spi/MockFileOutput.java +63 -0
- data/embulk-core/src/test/java/org/embulk/spi/MockFormatterPlugin.java +101 -0
- data/embulk-core/src/test/java/org/embulk/spi/MockParserPlugin.java +73 -0
- data/embulk-core/src/test/java/org/embulk/spi/PageTestUtils.java +78 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +67 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestFileInputRunner.java +180 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestFileOutputRunner.java +192 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestInputStreamFileInput.java +188 -0
- data/embulk-core/src/test/java/org/embulk/spi/TestPageBuilderReader.java +301 -0
- data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestamp.java +116 -0
- data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestampFormatterParser.java +52 -0
- data/embulk-core/src/test/java/org/embulk/spi/type/TestTypeSerDe.java +45 -0
- data/embulk-core/src/test/java/org/embulk/spi/util/TestLineDecoder.java +132 -0
- data/embulk-core/src/test/java/org/embulk/spi/util/TestLineEncoder.java +123 -0
- data/embulk-standards/build.gradle +6 -0
- data/embulk-standards/pom.xml +68 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java +158 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +233 -0
- data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +355 -0
- data/embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java +55 -0
- data/embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java +39 -0
- data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +138 -0
- data/embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java +128 -0
- data/embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java +46 -0
- data/embulk-standards/src/main/java/org/embulk/standards/S3FileInputPlugin.java +238 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginExtension.java +16 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +44 -0
- data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +71 -0
- data/embulk-standards/src/main/resources/META-INF/services/org.embulk.spi.Extension +1 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +69 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +291 -0
- data/embulk-standards/src/test/java/org/embulk/standards/TestS3FileInputPlugin.java +43 -0
- data/embulk.gemspec +27 -0
- data/examples/config.yml +34 -0
- data/examples/csv/sample.csv.gz +0 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +6 -0
- data/gradlew +164 -0
- data/gradlew.bat +90 -0
- data/lib/embulk.rb +16 -0
- data/lib/embulk/buffer.rb +17 -0
- data/lib/embulk/column.rb +47 -0
- data/lib/embulk/command/embulk.rb +39 -0
- data/lib/embulk/command/embulk_example.rb +32 -0
- data/lib/embulk/command/embulk_generate_bin.rb +62 -0
- data/lib/embulk/command/embulk_run.rb +243 -0
- data/lib/embulk/data/bundle/.bundle/config +3 -0
- data/lib/embulk/data/bundle/Gemfile +31 -0
- data/lib/embulk/data/bundle/Gemfile.lock +8 -0
- data/lib/embulk/data/bundle/embulk/input_example.rb +40 -0
- data/lib/embulk/data/bundle/embulk/output_example.rb +51 -0
- data/lib/embulk/data_source.rb +66 -0
- data/lib/embulk/error.rb +5 -0
- data/lib/embulk/guess_charset.rb +26 -0
- data/lib/embulk/guess_csv.rb +195 -0
- data/lib/embulk/guess_gzip.rb +18 -0
- data/lib/embulk/guess_newline.rb +20 -0
- data/lib/embulk/guess_plugin.rb +113 -0
- data/lib/embulk/input_plugin.rb +53 -0
- data/lib/embulk/java/bootstrap.rb +12 -0
- data/lib/embulk/java/imports.rb +26 -0
- data/lib/embulk/java/time_helper.rb +77 -0
- data/lib/embulk/output_plugin.rb +104 -0
- data/lib/embulk/page.rb +28 -0
- data/lib/embulk/page_builder.rb +22 -0
- data/lib/embulk/plugin.rb +152 -0
- data/lib/embulk/plugin_registry.rb +70 -0
- data/lib/embulk/schema.rb +85 -0
- data/lib/embulk/time_format_guess.rb +331 -0
- data/lib/embulk/version.rb +3 -0
- data/pom.xml +533 -0
- data/settings.gradle +5 -0
- metadata +370 -0
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|
3
|
+
<modelVersion>4.0.0</modelVersion>
|
|
4
|
+
|
|
5
|
+
<parent>
|
|
6
|
+
<groupId>org.embulk</groupId>
|
|
7
|
+
<artifactId>embulk-parent</artifactId>
|
|
8
|
+
<version>0.1.0-SNAPSHOT</version>
|
|
9
|
+
</parent>
|
|
10
|
+
|
|
11
|
+
<artifactId>embulk-standards</artifactId>
|
|
12
|
+
<name>embulk-standards</name>
|
|
13
|
+
|
|
14
|
+
<dependencies>
|
|
15
|
+
<dependency>
|
|
16
|
+
<groupId>org.embulk</groupId>
|
|
17
|
+
<artifactId>embulk-core</artifactId>
|
|
18
|
+
</dependency>
|
|
19
|
+
|
|
20
|
+
<dependency>
|
|
21
|
+
<groupId>org.embulk</groupId>
|
|
22
|
+
<artifactId>embulk-core</artifactId>
|
|
23
|
+
<type>test-jar</type>
|
|
24
|
+
<scope>test</scope>
|
|
25
|
+
</dependency>
|
|
26
|
+
|
|
27
|
+
<dependency>
|
|
28
|
+
<groupId>com.google.guava</groupId>
|
|
29
|
+
<artifactId>guava</artifactId>
|
|
30
|
+
</dependency>
|
|
31
|
+
|
|
32
|
+
<dependency>
|
|
33
|
+
<groupId>com.google.inject</groupId>
|
|
34
|
+
<artifactId>guice</artifactId>
|
|
35
|
+
</dependency>
|
|
36
|
+
|
|
37
|
+
<dependency>
|
|
38
|
+
<groupId>javax.validation</groupId>
|
|
39
|
+
<artifactId>validation-api</artifactId>
|
|
40
|
+
</dependency>
|
|
41
|
+
|
|
42
|
+
<dependency>
|
|
43
|
+
<groupId>com.fasterxml.jackson.core</groupId>
|
|
44
|
+
<artifactId>jackson-databind</artifactId>
|
|
45
|
+
</dependency>
|
|
46
|
+
|
|
47
|
+
<dependency>
|
|
48
|
+
<groupId>org.slf4j</groupId>
|
|
49
|
+
<artifactId>slf4j-api</artifactId>
|
|
50
|
+
</dependency>
|
|
51
|
+
|
|
52
|
+
<dependency>
|
|
53
|
+
<groupId>com.amazonaws</groupId>
|
|
54
|
+
<artifactId>aws-java-sdk</artifactId>
|
|
55
|
+
<version>1.5.2</version>
|
|
56
|
+
</dependency>
|
|
57
|
+
|
|
58
|
+
<dependency>
|
|
59
|
+
<groupId>junit</groupId>
|
|
60
|
+
<artifactId>junit</artifactId>
|
|
61
|
+
</dependency>
|
|
62
|
+
|
|
63
|
+
<dependency>
|
|
64
|
+
<groupId>org.mockito</groupId>
|
|
65
|
+
<artifactId>mockito-core</artifactId>
|
|
66
|
+
</dependency>
|
|
67
|
+
</dependencies>
|
|
68
|
+
</project>
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
package org.embulk.standards;
|
|
2
|
+
|
|
3
|
+
import com.google.common.collect.ImmutableBiMap;
|
|
4
|
+
import com.google.common.collect.ImmutableMap;
|
|
5
|
+
import org.embulk.config.Config;
|
|
6
|
+
import org.embulk.config.ConfigDefault;
|
|
7
|
+
import org.embulk.spi.type.TimestampType;
|
|
8
|
+
import org.embulk.spi.time.Timestamp;
|
|
9
|
+
import org.embulk.spi.time.TimestampFormatter;
|
|
10
|
+
import org.embulk.config.TaskSource;
|
|
11
|
+
import org.embulk.config.ConfigSource;
|
|
12
|
+
import org.embulk.spi.Column;
|
|
13
|
+
import org.embulk.spi.Schema;
|
|
14
|
+
import org.embulk.spi.SchemaVisitor;
|
|
15
|
+
import org.embulk.spi.FormatterPlugin;
|
|
16
|
+
import org.embulk.spi.Page;
|
|
17
|
+
import org.embulk.spi.PageOutput;
|
|
18
|
+
import org.embulk.spi.PageReader;
|
|
19
|
+
import org.embulk.spi.Exec;
|
|
20
|
+
import org.embulk.spi.FileOutput;
|
|
21
|
+
import org.embulk.spi.util.LineEncoder;
|
|
22
|
+
|
|
23
|
+
import java.util.Map;
|
|
24
|
+
|
|
25
|
+
public class CsvFormatterPlugin
|
|
26
|
+
implements FormatterPlugin
|
|
27
|
+
{
|
|
28
|
+
public interface PluginTask
|
|
29
|
+
extends LineEncoder.EncoderTask, TimestampFormatter.FormatterTask
|
|
30
|
+
{
|
|
31
|
+
@Config("header_line")
|
|
32
|
+
@ConfigDefault("true")
|
|
33
|
+
public boolean getHeaderLine();
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
@Override
|
|
37
|
+
public void transaction(ConfigSource config, Schema schema,
|
|
38
|
+
FormatterPlugin.Control control)
|
|
39
|
+
{
|
|
40
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
|
41
|
+
control.run(task.dump());
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
private Map<Integer, TimestampFormatter> newTimestampFormatters(
|
|
45
|
+
TimestampFormatter.FormatterTask task, Schema schema)
|
|
46
|
+
{
|
|
47
|
+
ImmutableMap.Builder<Integer, TimestampFormatter> builder = new ImmutableBiMap.Builder<>();
|
|
48
|
+
for (Column column : schema.getColumns()) {
|
|
49
|
+
if (column.getType() instanceof TimestampType) {
|
|
50
|
+
TimestampType tt = (TimestampType) column.getType();
|
|
51
|
+
builder.put(column.getIndex(), new TimestampFormatter(tt.getFormat(), task));
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
return builder.build();
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
@Override
|
|
58
|
+
public PageOutput open(TaskSource taskSource, final Schema schema,
|
|
59
|
+
FileOutput output)
|
|
60
|
+
{
|
|
61
|
+
final PluginTask task = taskSource.loadTask(PluginTask.class);
|
|
62
|
+
final LineEncoder encoder = new LineEncoder(output, task);
|
|
63
|
+
final Map<Integer, TimestampFormatter> timestampFormatters =
|
|
64
|
+
newTimestampFormatters(task, schema);
|
|
65
|
+
|
|
66
|
+
// create a file
|
|
67
|
+
encoder.nextFile();
|
|
68
|
+
|
|
69
|
+
// write header
|
|
70
|
+
if (task.getHeaderLine()) {
|
|
71
|
+
writeHeader(schema, encoder);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
return new PageOutput() {
|
|
75
|
+
private final PageReader pageReader = new PageReader(schema);
|
|
76
|
+
|
|
77
|
+
public void add(Page page)
|
|
78
|
+
{
|
|
79
|
+
pageReader.setPage(page);
|
|
80
|
+
while (pageReader.nextRecord()) {
|
|
81
|
+
schema.visitColumns(new SchemaVisitor() {
|
|
82
|
+
public void booleanColumn(Column column)
|
|
83
|
+
{
|
|
84
|
+
addDelimiter(column);
|
|
85
|
+
if (!pageReader.isNull(column)) {
|
|
86
|
+
encoder.addText(Boolean.toString(pageReader.getBoolean(column)));
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
public void longColumn(Column column)
|
|
91
|
+
{
|
|
92
|
+
addDelimiter(column);
|
|
93
|
+
if (!pageReader.isNull(column)) {
|
|
94
|
+
encoder.addText(Long.toString(pageReader.getLong(column)));
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
public void doubleColumn(Column column)
|
|
99
|
+
{
|
|
100
|
+
addDelimiter(column);
|
|
101
|
+
if (!pageReader.isNull(column)) {
|
|
102
|
+
encoder.addText(Double.toString(pageReader.getDouble(column)));
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
public void stringColumn(Column column)
|
|
107
|
+
{
|
|
108
|
+
addDelimiter(column);
|
|
109
|
+
if (!pageReader.isNull(column)) {
|
|
110
|
+
// TODO escape and quoting
|
|
111
|
+
encoder.addText(pageReader.getString(column));
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
public void timestampColumn(Column column)
|
|
116
|
+
{
|
|
117
|
+
addDelimiter(column);
|
|
118
|
+
if (!pageReader.isNull(column)) {
|
|
119
|
+
Timestamp value = pageReader.getTimestamp(column);
|
|
120
|
+
encoder.addText(timestampFormatters.get(column.getIndex()).format(value));
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
private void addDelimiter(Column column)
|
|
125
|
+
{
|
|
126
|
+
if (column.getIndex() != 0) {
|
|
127
|
+
encoder.addText(",");
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
});
|
|
131
|
+
|
|
132
|
+
encoder.addNewLine();
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
public void finish()
|
|
137
|
+
{
|
|
138
|
+
encoder.finish();
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
public void close()
|
|
142
|
+
{
|
|
143
|
+
encoder.close();
|
|
144
|
+
}
|
|
145
|
+
};
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
private void writeHeader(Schema schema, LineEncoder encoder)
|
|
149
|
+
{
|
|
150
|
+
for (Column column : schema.getColumns()) {
|
|
151
|
+
if (column.getIndex() != 0) {
|
|
152
|
+
encoder.addText(",");
|
|
153
|
+
}
|
|
154
|
+
encoder.addText(column.getName());
|
|
155
|
+
}
|
|
156
|
+
encoder.addNewLine();
|
|
157
|
+
}
|
|
158
|
+
}
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
package org.embulk.standards;
|
|
2
|
+
|
|
3
|
+
import com.google.common.base.Preconditions;
|
|
4
|
+
import com.google.common.collect.ImmutableMap;
|
|
5
|
+
import com.google.common.base.Optional;
|
|
6
|
+
import org.embulk.config.Task;
|
|
7
|
+
import org.embulk.config.Config;
|
|
8
|
+
import org.embulk.config.ConfigDefault;
|
|
9
|
+
import org.embulk.config.ConfigSource;
|
|
10
|
+
import org.embulk.config.TaskSource;
|
|
11
|
+
import org.embulk.spi.type.TimestampType;
|
|
12
|
+
import org.embulk.spi.time.TimestampParser;
|
|
13
|
+
import org.embulk.spi.time.TimestampParseException;
|
|
14
|
+
import org.embulk.spi.Column;
|
|
15
|
+
import org.embulk.spi.Schema;
|
|
16
|
+
import org.embulk.spi.SchemaConfig;
|
|
17
|
+
import org.embulk.spi.SchemaVisitor;
|
|
18
|
+
import org.embulk.spi.PageBuilder;
|
|
19
|
+
import org.embulk.spi.ParserPlugin;
|
|
20
|
+
import org.embulk.spi.Exec;
|
|
21
|
+
import org.embulk.spi.FileInput;
|
|
22
|
+
import org.embulk.spi.PageOutput;
|
|
23
|
+
import org.embulk.spi.BufferAllocator;
|
|
24
|
+
import org.embulk.spi.util.LineDecoder;
|
|
25
|
+
import org.slf4j.Logger;
|
|
26
|
+
|
|
27
|
+
import java.util.Map;
|
|
28
|
+
|
|
29
|
+
public class CsvParserPlugin
|
|
30
|
+
implements ParserPlugin
|
|
31
|
+
{
|
|
32
|
+
public interface PluginTask
|
|
33
|
+
extends Task, LineDecoder.DecoderTask, TimestampParser.ParserTask
|
|
34
|
+
{
|
|
35
|
+
@Config("columns")
|
|
36
|
+
public SchemaConfig getSchemaConfig();
|
|
37
|
+
|
|
38
|
+
@Config("header_line") // how to set default value?? TODO @Default("true")
|
|
39
|
+
@ConfigDefault("false")
|
|
40
|
+
public boolean getHeaderLine();
|
|
41
|
+
|
|
42
|
+
@Config("delimiter")
|
|
43
|
+
@ConfigDefault("\",\"")
|
|
44
|
+
public char getDelimiterChar();
|
|
45
|
+
|
|
46
|
+
@Config("quote")
|
|
47
|
+
@ConfigDefault("\"\\\"\"")
|
|
48
|
+
public char getQuoteChar();
|
|
49
|
+
|
|
50
|
+
@Config("escape")
|
|
51
|
+
@ConfigDefault("\"\\\\\"")
|
|
52
|
+
public char getEscapeChar();
|
|
53
|
+
|
|
54
|
+
// Null value handling: if the CsvParser found 'non-quoted empty string's,
|
|
55
|
+
// it replaces them to string that users specified like "\N", "NULL".
|
|
56
|
+
@Config("null_string")
|
|
57
|
+
@ConfigDefault("null")
|
|
58
|
+
public Optional<String> getNullString();
|
|
59
|
+
|
|
60
|
+
@Config("trim_if_not_quoted")
|
|
61
|
+
@ConfigDefault("false")
|
|
62
|
+
public boolean getTrimIfNotQuoted();
|
|
63
|
+
|
|
64
|
+
@Config("max_quoted_size_limit")
|
|
65
|
+
@ConfigDefault("131072") //128kB
|
|
66
|
+
public long getMaxQuotedSizeLimit();
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
private final Logger log;
|
|
70
|
+
|
|
71
|
+
public CsvParserPlugin()
|
|
72
|
+
{
|
|
73
|
+
log = Exec.getLogger(CsvParserPlugin.class);
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
@Override
|
|
77
|
+
public void transaction(ConfigSource config, ParserPlugin.Control control)
|
|
78
|
+
{
|
|
79
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
|
80
|
+
control.run(task.dump(), task.getSchemaConfig().toSchema());
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
private Map<Integer, TimestampParser> newTimestampParsers(
|
|
84
|
+
TimestampParser.ParserTask task, Schema schema)
|
|
85
|
+
{
|
|
86
|
+
ImmutableMap.Builder<Integer, TimestampParser> builder = new ImmutableMap.Builder<>();
|
|
87
|
+
for (Column column : schema.getColumns()) {
|
|
88
|
+
if (column.getType() instanceof TimestampType) {
|
|
89
|
+
TimestampType tt = (TimestampType) column.getType();
|
|
90
|
+
builder.put(column.getIndex(), new TimestampParser(tt.getFormat(), task));
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
return builder.build();
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
@Override
|
|
97
|
+
public void run(TaskSource taskSource, final Schema schema,
|
|
98
|
+
FileInput input, PageOutput output)
|
|
99
|
+
{
|
|
100
|
+
PluginTask task = taskSource.loadTask(PluginTask.class);
|
|
101
|
+
final Map<Integer, TimestampParser> timestampFormatters = newTimestampParsers(task, schema);
|
|
102
|
+
final CsvTokenizer tokenizer = new CsvTokenizer(new LineDecoder(input, task), task);
|
|
103
|
+
final String nullStringOrNull = task.getNullString().orNull();
|
|
104
|
+
boolean skipHeaderLine = task.getHeaderLine();
|
|
105
|
+
|
|
106
|
+
try (final PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output)) {
|
|
107
|
+
while (tokenizer.nextFile()) {
|
|
108
|
+
if (skipHeaderLine) {
|
|
109
|
+
// skip the first line
|
|
110
|
+
if (tokenizer.nextRecord()) {
|
|
111
|
+
for (int i=0; i < schema.getColumnCount(); i++) {
|
|
112
|
+
tokenizer.nextColumn(); // TODO check return value?
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
while (true) {
|
|
118
|
+
try {
|
|
119
|
+
if (!tokenizer.nextRecord()) {
|
|
120
|
+
break;
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
schema.visitColumns(new SchemaVisitor() {
|
|
124
|
+
public void booleanColumn(Column column)
|
|
125
|
+
{
|
|
126
|
+
String v = nextColumn(schema, tokenizer, nullStringOrNull);
|
|
127
|
+
if (v == null) {
|
|
128
|
+
pageBuilder.setNull(column);
|
|
129
|
+
} else {
|
|
130
|
+
pageBuilder.setBoolean(column, Boolean.parseBoolean(v));
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
public void longColumn(Column column)
|
|
135
|
+
{
|
|
136
|
+
String v = nextColumn(schema, tokenizer, nullStringOrNull);
|
|
137
|
+
if (v == null) {
|
|
138
|
+
pageBuilder.setNull(column);
|
|
139
|
+
} else {
|
|
140
|
+
try {
|
|
141
|
+
pageBuilder.setLong(column, Long.parseLong(v));
|
|
142
|
+
} catch (NumberFormatException e) {
|
|
143
|
+
// TODO support default value
|
|
144
|
+
throw new CsvRecordValidateException(e);
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
public void doubleColumn(Column column)
|
|
150
|
+
{
|
|
151
|
+
String v = nextColumn(schema, tokenizer, nullStringOrNull);
|
|
152
|
+
if (v == null) {
|
|
153
|
+
pageBuilder.setNull(column);
|
|
154
|
+
} else {
|
|
155
|
+
try {
|
|
156
|
+
pageBuilder.setDouble(column, Double.parseDouble(v));
|
|
157
|
+
} catch (NumberFormatException e) {
|
|
158
|
+
// TODO support default value
|
|
159
|
+
throw new CsvRecordValidateException(e);
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
public void stringColumn(Column column)
|
|
165
|
+
{
|
|
166
|
+
String v = nextColumn(schema, tokenizer, nullStringOrNull);
|
|
167
|
+
if (v == null) {
|
|
168
|
+
pageBuilder.setNull(column);
|
|
169
|
+
} else {
|
|
170
|
+
pageBuilder.setString(column, v);
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
public void timestampColumn(Column column)
|
|
175
|
+
{
|
|
176
|
+
String v = nextColumn(schema, tokenizer, nullStringOrNull);
|
|
177
|
+
if (v == null) {
|
|
178
|
+
pageBuilder.setNull(column);
|
|
179
|
+
} else {
|
|
180
|
+
try {
|
|
181
|
+
pageBuilder.setTimestamp(column, (timestampFormatters.get(column.getIndex()).parse(v)));
|
|
182
|
+
} catch (TimestampParseException e) {
|
|
183
|
+
// TODO support default value
|
|
184
|
+
throw new CsvRecordValidateException(e);
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
});
|
|
189
|
+
pageBuilder.addRecord();
|
|
190
|
+
|
|
191
|
+
} catch (Exception e) {
|
|
192
|
+
// TODO logging
|
|
193
|
+
long lineNumber = tokenizer.getCurrentLineNumber();
|
|
194
|
+
String skippedLine = tokenizer.skipCurrentLine();
|
|
195
|
+
log.warn(String.format("Skipped (line %d): %s", lineNumber, skippedLine), e);
|
|
196
|
+
//exec.notice().skippedLine(skippedLine);
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
pageBuilder.finish();
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
private static String nextColumn(Schema schema, CsvTokenizer tokenizer, String nullStringOrNull)
|
|
206
|
+
{
|
|
207
|
+
String v = tokenizer.nextColumn();
|
|
208
|
+
if (v == null) {
|
|
209
|
+
throw new RuntimeException(String.format("Expected %d columns but line %d has fewer number of columns",
|
|
210
|
+
schema.getColumnCount(), tokenizer.getCurrentLineNumber()));
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
if (!v.isEmpty()) {
|
|
214
|
+
if (v.equals(nullStringOrNull)) {
|
|
215
|
+
return null;
|
|
216
|
+
}
|
|
217
|
+
return v;
|
|
218
|
+
} else if (tokenizer.wasQuotedColumn()) {
|
|
219
|
+
return "";
|
|
220
|
+
} else {
|
|
221
|
+
return null;
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
static class CsvRecordValidateException
|
|
226
|
+
extends RuntimeException
|
|
227
|
+
{
|
|
228
|
+
CsvRecordValidateException(Throwable cause)
|
|
229
|
+
{
|
|
230
|
+
super(cause);
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
}
|