embulk-output-utf8parquet 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/LICENSE.txt +21 -0
- data/build.gradle +91 -0
- data/build/classes/main/org/embulk/output/EmbulkWriteSupport$ParquetColumnVisitor.class +0 -0
- data/build/classes/main/org/embulk/output/EmbulkWriteSupport$SchemaConvertColumnVisitor.class +0 -0
- data/build/classes/main/org/embulk/output/EmbulkWriteSupport$SchemaConvertColumnVisitorWithUTF8.class +0 -0
- data/build/classes/main/org/embulk/output/EmbulkWriteSupport.class +0 -0
- data/build/classes/main/org/embulk/output/EmbulkWriterBuilder.class +0 -0
- data/build/classes/main/org/embulk/output/ParquetOutputPlugin$ParquetTransactionalPageOutput.class +0 -0
- data/build/classes/main/org/embulk/output/ParquetOutputPlugin$PluginTask.class +0 -0
- data/build/classes/main/org/embulk/output/ParquetOutputPlugin$TimestampColumnOption.class +0 -0
- data/build/classes/main/org/embulk/output/ParquetOutputPlugin.class +0 -0
- data/build/classes/test/org/embulk/output/ParquetOutputPluginTest.class +0 -0
- data/build/gemspec +19 -0
- data/build/libs/embulk-output-utf8parquet-1.0.0.jar +0 -0
- data/build/libs/embulk-output-utf8parquet-1.0.1.jar +0 -0
- data/build/reports/checkstyle/main.html +119 -0
- data/build/reports/checkstyle/main.xml +9 -0
- data/build/reports/checkstyle/test.html +99 -0
- data/build/reports/checkstyle/test.xml +5 -0
- data/build/reports/tests/test/classes/org.embulk.output.ParquetOutputPluginTest.html +106 -0
- data/build/reports/tests/test/css/base-style.css +179 -0
- data/build/reports/tests/test/css/style.css +84 -0
- data/build/reports/tests/test/index.html +132 -0
- data/build/reports/tests/test/js/report.js +194 -0
- data/build/reports/tests/test/packages/org.embulk.output.html +103 -0
- data/build/test-results/test/TEST-org.embulk.output.ParquetOutputPluginTest.xml +9 -0
- data/build/test-results/test/binary/output.bin +0 -0
- data/build/test-results/test/binary/output.bin.idx +0 -0
- data/build/test-results/test/binary/results.bin +0 -0
- data/build/tmp/jar/MANIFEST.MF +2 -0
- data/classpath/embulk-output-utf8parquet-1.0.2.jar +0 -0
- data/config/checkstyle/checkstyle.xml +128 -0
- data/config/checkstyle/default.xml +108 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +6 -0
- data/gradlew +164 -0
- data/gradlew.bat +90 -0
- data/lib/embulk/output/utf8parquet.rb +3 -0
- data/src/main/java/org/embulk/output/EmbulkWriteSupport.java +215 -0
- data/src/main/java/org/embulk/output/EmbulkWriterBuilder.java +37 -0
- data/src/main/java/org/embulk/output/ParquetOutputPlugin.java +236 -0
- data/src/test/java/org/embulk/output/ParquetOutputPluginTest.java +70 -0
- metadata +115 -73
data/gradlew.bat
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem  Gradle startup script for Windows
@rem
@rem  Locates java.exe (JAVA_HOME first, then PATH), collects the command-line
@rem  arguments and launches org.gradle.wrapper.GradleWrapperMain from
@rem  gradle\wrapper\gradle-wrapper.jar next to this script.
@rem
@rem ##########################################################################

@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal

@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=

@rem Directory containing this script; falls back to the current directory.
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%

@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome

@rem No JAVA_HOME: probe for a java.exe on the PATH.
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init

echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:findJavaFromJavaHome
@rem Strip any double quotes from JAVA_HOME before building the executable path.
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe

if exist "%JAVA_EXE%" goto init

echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:init
@rem Get command-line arguments, handling Windowz variants

if not "%OS%" == "Windows_NT" goto win9xME_args
if "%@eval[2+2]" == "4" goto 4NT_args

:win9xME_args
@rem Slurp the command line arguments.
set CMD_LINE_ARGS=
set _SKIP=2

:win9xME_args_slurp
if "x%~1" == "x" goto execute

set CMD_LINE_ARGS=%*
goto execute

:4NT_args
@rem Get arguments from the 4NT Shell from JP Software
set CMD_LINE_ARGS=%$

:execute
@rem Setup the command line

set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar

@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%

:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd

:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
rem Preserve the real failure code of the launched process instead of collapsing
rem every failure to 1; fall back to 1 when this label is reached with ERRORLEVEL 0
rem (for example after the JAVA_HOME validation messages).
set EXIT_CODE=%ERRORLEVEL%
if %EXIT_CODE% equ 0 set EXIT_CODE=1
if not "" == "%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE%
exit /b %EXIT_CODE%

:mainEnd
if "%OS%"=="Windows_NT" endlocal

:omega
|
@@ -0,0 +1,215 @@
|
|
1
|
+
package org.embulk.output;
|
2
|
+
|
3
|
+
import org.apache.hadoop.conf.Configuration;
|
4
|
+
import org.apache.parquet.hadoop.api.WriteSupport;
|
5
|
+
import org.apache.parquet.io.api.Binary;
|
6
|
+
import org.apache.parquet.io.api.RecordConsumer;
|
7
|
+
import org.apache.parquet.schema.MessageType;
|
8
|
+
import org.apache.parquet.schema.OriginalType;
|
9
|
+
import org.apache.parquet.schema.PrimitiveType;
|
10
|
+
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
|
11
|
+
import org.apache.parquet.schema.Type;
|
12
|
+
import org.embulk.spi.Column;
|
13
|
+
import org.embulk.spi.ColumnVisitor;
|
14
|
+
import org.embulk.spi.PageReader;
|
15
|
+
import org.embulk.spi.Schema;
|
16
|
+
import org.embulk.spi.time.Timestamp;
|
17
|
+
import org.embulk.spi.time.TimestampFormatter;
|
18
|
+
|
19
|
+
import java.util.ArrayList;
|
20
|
+
import java.util.HashMap;
|
21
|
+
import java.util.List;
|
22
|
+
import java.util.Map;
|
23
|
+
|
24
|
+
public class EmbulkWriteSupport
|
25
|
+
extends WriteSupport<PageReader>
|
26
|
+
{
|
27
|
+
final Schema schema;
|
28
|
+
RecordConsumer consumer;
|
29
|
+
WriteContext writeContext;
|
30
|
+
TimestampFormatter[] timestampFormatters;
|
31
|
+
boolean addUTF8;
|
32
|
+
|
33
|
+
public EmbulkWriteSupport(Schema schema, TimestampFormatter[] timestampFormatters, boolean addUTF8)
|
34
|
+
{
|
35
|
+
this.schema = schema;
|
36
|
+
this.timestampFormatters = timestampFormatters;
|
37
|
+
this.addUTF8 = addUTF8;
|
38
|
+
}
|
39
|
+
|
40
|
+
@Override
|
41
|
+
public WriteContext init(Configuration configuration)
|
42
|
+
{
|
43
|
+
if (writeContext == null) {
|
44
|
+
init();
|
45
|
+
}
|
46
|
+
return writeContext;
|
47
|
+
}
|
48
|
+
|
49
|
+
@Override
|
50
|
+
public void prepareForWrite(RecordConsumer recordConsumer)
|
51
|
+
{
|
52
|
+
this.consumer = recordConsumer;
|
53
|
+
}
|
54
|
+
|
55
|
+
@Override
|
56
|
+
public void write(PageReader record)
|
57
|
+
{
|
58
|
+
final ColumnVisitor visitor = new ParquetColumnVisitor(record, consumer);
|
59
|
+
consumer.startMessage();
|
60
|
+
for (Column c : schema.getColumns()) {
|
61
|
+
if (!record.isNull(c)) {
|
62
|
+
consumer.startField(c.getName(), c.getIndex());
|
63
|
+
c.visit(visitor);
|
64
|
+
consumer.endField(c.getName(), c.getIndex());
|
65
|
+
}
|
66
|
+
}
|
67
|
+
consumer.endMessage();
|
68
|
+
}
|
69
|
+
|
70
|
+
private void init()
|
71
|
+
{
|
72
|
+
MessageType messageType = convertSchema(schema);
|
73
|
+
Map<String, String> metadata = new HashMap<>();
|
74
|
+
writeContext = new WriteContext(messageType, metadata);
|
75
|
+
}
|
76
|
+
|
77
|
+
private MessageType convertSchema(Schema schema)
|
78
|
+
{
|
79
|
+
SchemaConvertColumnVisitor visitor = null;
|
80
|
+
if (addUTF8) {
|
81
|
+
visitor = new SchemaConvertColumnVisitorWithUTF8();
|
82
|
+
}
|
83
|
+
else {
|
84
|
+
visitor = new SchemaConvertColumnVisitor();
|
85
|
+
}
|
86
|
+
schema.visitColumns(visitor);
|
87
|
+
String messageName = "embulk";
|
88
|
+
return new MessageType(messageName, visitor.getConvertedFields());
|
89
|
+
}
|
90
|
+
|
91
|
+
class ParquetColumnVisitor
|
92
|
+
implements ColumnVisitor
|
93
|
+
{
|
94
|
+
final PageReader record;
|
95
|
+
final RecordConsumer consumer;
|
96
|
+
|
97
|
+
public ParquetColumnVisitor(PageReader record, RecordConsumer consumer)
|
98
|
+
{
|
99
|
+
this.record = record;
|
100
|
+
this.consumer = consumer;
|
101
|
+
}
|
102
|
+
|
103
|
+
@Override
|
104
|
+
public void booleanColumn(Column column)
|
105
|
+
{
|
106
|
+
if (!record.isNull(column)) {
|
107
|
+
consumer.addBoolean(record.getBoolean(column));
|
108
|
+
}
|
109
|
+
}
|
110
|
+
|
111
|
+
@Override
|
112
|
+
public void longColumn(Column column)
|
113
|
+
{
|
114
|
+
if (!record.isNull(column)) {
|
115
|
+
consumer.addLong(record.getLong(column));
|
116
|
+
}
|
117
|
+
}
|
118
|
+
|
119
|
+
@Override
|
120
|
+
public void doubleColumn(Column column)
|
121
|
+
{
|
122
|
+
if (!record.isNull(column)) {
|
123
|
+
consumer.addDouble(record.getDouble(column));
|
124
|
+
}
|
125
|
+
}
|
126
|
+
|
127
|
+
@Override
|
128
|
+
public void stringColumn(Column column)
|
129
|
+
{
|
130
|
+
if (!record.isNull(column)) {
|
131
|
+
consumer.addBinary(Binary.fromString(record.getString(column)));
|
132
|
+
}
|
133
|
+
}
|
134
|
+
|
135
|
+
@Override
|
136
|
+
public void jsonColumn(Column column)
|
137
|
+
{
|
138
|
+
throw new UnsupportedOperationException("This plugin doesn't support json type. Please try to upgrade version of the plugin using 'embulk gem update' command. If the latest version still doesn't support json type, please contact plugin developers, or change configuration of input plugin not to use json type.");
|
139
|
+
}
|
140
|
+
|
141
|
+
@Override
|
142
|
+
public void timestampColumn(Column column)
|
143
|
+
{
|
144
|
+
if (!record.isNull(column)) {
|
145
|
+
Timestamp t = record.getTimestamp(column);
|
146
|
+
String formatted = timestampFormatters[column.getIndex()].format(t);
|
147
|
+
consumer.addBinary(Binary.fromString(formatted));
|
148
|
+
}
|
149
|
+
}
|
150
|
+
}
|
151
|
+
|
152
|
+
class SchemaConvertColumnVisitor
|
153
|
+
implements ColumnVisitor
|
154
|
+
{
|
155
|
+
List<Type> fields = new ArrayList<>();
|
156
|
+
|
157
|
+
@Override
|
158
|
+
public void booleanColumn(Column column)
|
159
|
+
{
|
160
|
+
fields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BOOLEAN, column.getName()));
|
161
|
+
}
|
162
|
+
|
163
|
+
@Override
|
164
|
+
public void longColumn(Column column)
|
165
|
+
{
|
166
|
+
fields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.INT64, column.getName()));
|
167
|
+
}
|
168
|
+
|
169
|
+
@Override
|
170
|
+
public void doubleColumn(Column column)
|
171
|
+
{
|
172
|
+
fields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.DOUBLE, column.getName()));
|
173
|
+
}
|
174
|
+
|
175
|
+
@Override
|
176
|
+
public void stringColumn(Column column)
|
177
|
+
{
|
178
|
+
fields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, column.getName()));
|
179
|
+
}
|
180
|
+
|
181
|
+
@Override
|
182
|
+
public void jsonColumn(Column column)
|
183
|
+
{
|
184
|
+
throw new UnsupportedOperationException("This plugin doesn't support json type. Please try to upgrade version of the plugin using 'embulk gem update' command. If the latest version still doesn't support json type, please contact plugin developers, or change configuration of input plugin not to use json type.");
|
185
|
+
}
|
186
|
+
|
187
|
+
@Override
|
188
|
+
public void timestampColumn(Column column)
|
189
|
+
{
|
190
|
+
// formatted as string
|
191
|
+
fields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, column.getName()));
|
192
|
+
}
|
193
|
+
|
194
|
+
public List<Type> getConvertedFields()
|
195
|
+
{
|
196
|
+
return fields;
|
197
|
+
}
|
198
|
+
}
|
199
|
+
|
200
|
+
class SchemaConvertColumnVisitorWithUTF8 extends SchemaConvertColumnVisitor
|
201
|
+
{
|
202
|
+
@Override
|
203
|
+
public void stringColumn(Column column)
|
204
|
+
{
|
205
|
+
fields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, column.getName(), OriginalType.UTF8));
|
206
|
+
}
|
207
|
+
|
208
|
+
@Override
|
209
|
+
public void timestampColumn(Column column)
|
210
|
+
{
|
211
|
+
// formatted as string
|
212
|
+
fields.add(new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, column.getName(), OriginalType.UTF8));
|
213
|
+
}
|
214
|
+
}
|
215
|
+
}
|
@@ -0,0 +1,37 @@
|
|
1
|
+
package org.embulk.output;
|
2
|
+
|
3
|
+
import org.apache.hadoop.conf.Configuration;
|
4
|
+
import org.apache.hadoop.fs.Path;
|
5
|
+
import org.apache.parquet.hadoop.ParquetWriter;
|
6
|
+
import org.apache.parquet.hadoop.api.WriteSupport;
|
7
|
+
import org.embulk.spi.PageReader;
|
8
|
+
import org.embulk.spi.Schema;
|
9
|
+
import org.embulk.spi.time.TimestampFormatter;
|
10
|
+
|
11
|
+
public class EmbulkWriterBuilder
|
12
|
+
extends ParquetWriter.Builder<PageReader, EmbulkWriterBuilder>
|
13
|
+
{
|
14
|
+
final Schema schema;
|
15
|
+
final TimestampFormatter[] timestampFormatters;
|
16
|
+
final boolean addUTF8;
|
17
|
+
|
18
|
+
public EmbulkWriterBuilder(Path file, Schema schema, TimestampFormatter[] timestampFormatters, boolean addUTF8)
|
19
|
+
{
|
20
|
+
super(file);
|
21
|
+
this.schema = schema;
|
22
|
+
this.timestampFormatters = timestampFormatters;
|
23
|
+
this.addUTF8 = addUTF8;
|
24
|
+
}
|
25
|
+
|
26
|
+
@Override
|
27
|
+
protected EmbulkWriterBuilder self()
|
28
|
+
{
|
29
|
+
return this;
|
30
|
+
}
|
31
|
+
|
32
|
+
@Override
|
33
|
+
protected WriteSupport<PageReader> getWriteSupport(Configuration conf)
|
34
|
+
{
|
35
|
+
return new EmbulkWriteSupport(schema, timestampFormatters, addUTF8);
|
36
|
+
}
|
37
|
+
}
|
@@ -0,0 +1,236 @@
|
|
1
|
+
package org.embulk.output;
|
2
|
+
|
3
|
+
import com.google.common.base.Throwables;
|
4
|
+
import org.apache.hadoop.conf.Configuration;
|
5
|
+
import org.apache.hadoop.fs.LocalFileSystem;
|
6
|
+
import org.apache.hadoop.fs.Path;
|
7
|
+
import org.apache.hadoop.hdfs.DistributedFileSystem;
|
8
|
+
import org.apache.parquet.hadoop.ParquetFileWriter;
|
9
|
+
import org.apache.parquet.hadoop.ParquetWriter;
|
10
|
+
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
11
|
+
import org.embulk.config.Config;
|
12
|
+
import org.embulk.config.ConfigDefault;
|
13
|
+
import org.embulk.config.ConfigDiff;
|
14
|
+
import org.embulk.config.ConfigSource;
|
15
|
+
import org.embulk.config.Task;
|
16
|
+
import org.embulk.config.TaskReport;
|
17
|
+
import org.embulk.config.TaskSource;
|
18
|
+
import org.embulk.spi.Exec;
|
19
|
+
import org.embulk.spi.OutputPlugin;
|
20
|
+
import org.embulk.spi.Page;
|
21
|
+
import org.embulk.spi.PageReader;
|
22
|
+
import org.embulk.spi.Schema;
|
23
|
+
import org.embulk.spi.TransactionalPageOutput;
|
24
|
+
import org.embulk.spi.time.TimestampFormatter;
|
25
|
+
import org.embulk.spi.util.Timestamps;
|
26
|
+
|
27
|
+
import java.io.IOException;
|
28
|
+
import java.util.List;
|
29
|
+
import java.util.Map;
|
30
|
+
|
31
|
+
@SuppressWarnings("unused")
|
32
|
+
public class ParquetOutputPlugin
|
33
|
+
implements OutputPlugin
|
34
|
+
{
|
35
|
+
public interface PluginTask
|
36
|
+
extends Task, TimestampFormatter.Task
|
37
|
+
{
|
38
|
+
@Config("path_prefix")
|
39
|
+
String getPathPrefix();
|
40
|
+
|
41
|
+
@Config("file_ext")
|
42
|
+
@ConfigDefault("\".parquet\"")
|
43
|
+
String getFileNameExtension();
|
44
|
+
|
45
|
+
@Config("sequence_format")
|
46
|
+
@ConfigDefault("\".%03d\"")
|
47
|
+
String getSequenceFormat();
|
48
|
+
|
49
|
+
@Config("block_size")
|
50
|
+
@ConfigDefault("134217728")
|
51
|
+
// 128M
|
52
|
+
int getBlockSize();
|
53
|
+
|
54
|
+
@Config("page_size")
|
55
|
+
@ConfigDefault("1048576")
|
56
|
+
// 1M
|
57
|
+
int getPageSize();
|
58
|
+
|
59
|
+
@Config("compression_codec")
|
60
|
+
@ConfigDefault("\"UNCOMPRESSED\"")
|
61
|
+
String getCompressionCodec();
|
62
|
+
|
63
|
+
@Config("column_options")
|
64
|
+
@ConfigDefault("{}")
|
65
|
+
Map<String, TimestampColumnOption> getColumnOptions();
|
66
|
+
|
67
|
+
@Config("extra_configurations")
|
68
|
+
@ConfigDefault("{}")
|
69
|
+
Map<String, String> getExtraConfigurations();
|
70
|
+
|
71
|
+
@Config("overwrite")
|
72
|
+
@ConfigDefault("false")
|
73
|
+
boolean getOverwrite();
|
74
|
+
|
75
|
+
@Config("addUTF8")
|
76
|
+
@ConfigDefault("false")
|
77
|
+
boolean getAddUTF8();
|
78
|
+
}
|
79
|
+
|
80
|
+
public interface TimestampColumnOption
|
81
|
+
extends Task, TimestampFormatter.TimestampColumnOption
|
82
|
+
{
|
83
|
+
}
|
84
|
+
|
85
|
+
public ConfigDiff transaction(ConfigSource config,
|
86
|
+
Schema schema, int processorCount,
|
87
|
+
OutputPlugin.Control control)
|
88
|
+
{
|
89
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
90
|
+
|
91
|
+
//TODO
|
92
|
+
|
93
|
+
control.run(task.dump());
|
94
|
+
return Exec.newConfigDiff();
|
95
|
+
}
|
96
|
+
|
97
|
+
public ConfigDiff resume(TaskSource taskSource,
|
98
|
+
Schema schema, int processorCount,
|
99
|
+
OutputPlugin.Control control)
|
100
|
+
{
|
101
|
+
throw new UnsupportedOperationException("parquet output plugin does not support resuming");
|
102
|
+
}
|
103
|
+
|
104
|
+
public void cleanup(TaskSource taskSource,
|
105
|
+
Schema schema, int processorCount,
|
106
|
+
List<TaskReport> successTaskReports)
|
107
|
+
{
|
108
|
+
//TODO
|
109
|
+
}
|
110
|
+
|
111
|
+
public TransactionalPageOutput open(TaskSource taskSource, final Schema schema, int processorIndex)
|
112
|
+
{
|
113
|
+
PluginTask task = taskSource.loadTask(PluginTask.class);
|
114
|
+
|
115
|
+
final PageReader reader = new PageReader(schema);
|
116
|
+
final ParquetWriter<PageReader> writer = createWriter(task, schema, processorIndex);
|
117
|
+
|
118
|
+
return new ParquetTransactionalPageOutput(reader, writer);
|
119
|
+
}
|
120
|
+
|
121
|
+
private String buildPath(PluginTask task, int processorIndex)
|
122
|
+
{
|
123
|
+
final String pathPrefix = task.getPathPrefix();
|
124
|
+
final String pathSuffix = task.getFileNameExtension();
|
125
|
+
final String sequenceFormat = task.getSequenceFormat();
|
126
|
+
return pathPrefix + String.format(sequenceFormat, processorIndex) + pathSuffix;
|
127
|
+
}
|
128
|
+
|
129
|
+
private ParquetWriter<PageReader> createWriter(PluginTask task, Schema schema, int processorIndex)
|
130
|
+
{
|
131
|
+
final TimestampFormatter[] timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions());
|
132
|
+
final boolean addUTF8 = task.getAddUTF8();
|
133
|
+
final Path path = new Path(buildPath(task, processorIndex));
|
134
|
+
final CompressionCodecName codec = CompressionCodecName.valueOf(task.getCompressionCodec());
|
135
|
+
final int blockSize = task.getBlockSize();
|
136
|
+
final int pageSize = task.getPageSize();
|
137
|
+
final Configuration conf = createConfiguration(task.getExtraConfigurations());
|
138
|
+
final boolean overwrite = task.getOverwrite();
|
139
|
+
|
140
|
+
ParquetWriter<PageReader> writer = null;
|
141
|
+
try {
|
142
|
+
EmbulkWriterBuilder builder = new EmbulkWriterBuilder(path, schema, timestampFormatters, addUTF8)
|
143
|
+
.withCompressionCodec(codec)
|
144
|
+
.withRowGroupSize(blockSize)
|
145
|
+
.withPageSize(pageSize)
|
146
|
+
.withDictionaryPageSize(pageSize)
|
147
|
+
.withConf(conf);
|
148
|
+
|
149
|
+
if (overwrite) {
|
150
|
+
builder.withWriteMode(ParquetFileWriter.Mode.OVERWRITE);
|
151
|
+
}
|
152
|
+
|
153
|
+
writer = builder.build();
|
154
|
+
}
|
155
|
+
catch (IOException e) {
|
156
|
+
Throwables.propagate(e);
|
157
|
+
}
|
158
|
+
return writer;
|
159
|
+
}
|
160
|
+
|
161
|
+
private Configuration createConfiguration(Map<String, String> extra)
|
162
|
+
{
|
163
|
+
Configuration conf = new Configuration();
|
164
|
+
|
165
|
+
// Default values
|
166
|
+
conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
|
167
|
+
conf.set("fs.file.impl", LocalFileSystem.class.getName());
|
168
|
+
|
169
|
+
// Optional values
|
170
|
+
for (Map.Entry<String, String> entry : extra.entrySet()) {
|
171
|
+
conf.set(entry.getKey(), entry.getValue());
|
172
|
+
}
|
173
|
+
|
174
|
+
conf.setClassLoader(this.getClass().getClassLoader());
|
175
|
+
|
176
|
+
return conf;
|
177
|
+
}
|
178
|
+
|
179
|
+
class ParquetTransactionalPageOutput
|
180
|
+
implements TransactionalPageOutput
|
181
|
+
{
|
182
|
+
private PageReader reader;
|
183
|
+
private ParquetWriter<PageReader> writer;
|
184
|
+
|
185
|
+
public ParquetTransactionalPageOutput(PageReader reader, ParquetWriter<PageReader> writer)
|
186
|
+
{
|
187
|
+
this.reader = reader;
|
188
|
+
this.writer = writer;
|
189
|
+
}
|
190
|
+
|
191
|
+
@Override
|
192
|
+
public void add(Page page)
|
193
|
+
{
|
194
|
+
try {
|
195
|
+
reader.setPage(page);
|
196
|
+
while (reader.nextRecord()) {
|
197
|
+
writer.write(reader);
|
198
|
+
}
|
199
|
+
}
|
200
|
+
catch (IOException e) {
|
201
|
+
Throwables.propagate(e);
|
202
|
+
}
|
203
|
+
}
|
204
|
+
|
205
|
+
@Override
|
206
|
+
public void finish()
|
207
|
+
{
|
208
|
+
try {
|
209
|
+
writer.close();
|
210
|
+
writer = null;
|
211
|
+
}
|
212
|
+
catch (IOException e) {
|
213
|
+
Throwables.propagate(e);
|
214
|
+
}
|
215
|
+
}
|
216
|
+
|
217
|
+
@Override
|
218
|
+
public void close()
|
219
|
+
{
|
220
|
+
//TODO
|
221
|
+
}
|
222
|
+
|
223
|
+
@Override
|
224
|
+
public void abort()
|
225
|
+
{
|
226
|
+
//TODO
|
227
|
+
}
|
228
|
+
|
229
|
+
@Override
|
230
|
+
public TaskReport commit()
|
231
|
+
{
|
232
|
+
return Exec.newTaskReport();
|
233
|
+
//TODO
|
234
|
+
}
|
235
|
+
}
|
236
|
+
}
|