embulk 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (204) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +13 -0
  3. data/Gemfile +3 -0
  4. data/Gemfile.lock +33 -0
  5. data/README.md +117 -0
  6. data/Rakefile +58 -0
  7. data/bin/embulk +63 -0
  8. data/build.gradle +149 -0
  9. data/embulk-cli/build.gradle +6 -0
  10. data/embulk-cli/pom.xml +94 -0
  11. data/embulk-cli/src/main/java/org/embulk/cli/Main.java +15 -0
  12. data/embulk-core/build.gradle +6 -0
  13. data/embulk-core/pom.xml +143 -0
  14. data/embulk-core/src/main/java/org/embulk/EmbulkService.java +39 -0
  15. data/embulk-core/src/main/java/org/embulk/command/Runner.java +199 -0
  16. data/embulk-core/src/main/java/org/embulk/command/TablePrinter.java +119 -0
  17. data/embulk-core/src/main/java/org/embulk/config/CommitReport.java +26 -0
  18. data/embulk-core/src/main/java/org/embulk/config/Config.java +15 -0
  19. data/embulk-core/src/main/java/org/embulk/config/ConfigDefault.java +15 -0
  20. data/embulk-core/src/main/java/org/embulk/config/ConfigException.java +20 -0
  21. data/embulk-core/src/main/java/org/embulk/config/ConfigLoader.java +83 -0
  22. data/embulk-core/src/main/java/org/embulk/config/ConfigSource.java +28 -0
  23. data/embulk-core/src/main/java/org/embulk/config/DataSource.java +35 -0
  24. data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +208 -0
  25. data/embulk-core/src/main/java/org/embulk/config/DataSourceSerDe.java +80 -0
  26. data/embulk-core/src/main/java/org/embulk/config/GenericTypeReference.java +20 -0
  27. data/embulk-core/src/main/java/org/embulk/config/ModelManager.java +125 -0
  28. data/embulk-core/src/main/java/org/embulk/config/NextConfig.java +26 -0
  29. data/embulk-core/src/main/java/org/embulk/config/Task.java +10 -0
  30. data/embulk-core/src/main/java/org/embulk/config/TaskInvocationHandler.java +180 -0
  31. data/embulk-core/src/main/java/org/embulk/config/TaskSerDe.java +343 -0
  32. data/embulk-core/src/main/java/org/embulk/config/TaskSource.java +28 -0
  33. data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +37 -0
  34. data/embulk-core/src/main/java/org/embulk/config/TaskValidator.java +24 -0
  35. data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +45 -0
  36. data/embulk-core/src/main/java/org/embulk/exec/ExecuteInterruptedException.java +10 -0
  37. data/embulk-core/src/main/java/org/embulk/exec/ExecuteResult.java +19 -0
  38. data/embulk-core/src/main/java/org/embulk/exec/ExtensionServiceLoaderModule.java +43 -0
  39. data/embulk-core/src/main/java/org/embulk/exec/ForSystemConfig.java +16 -0
  40. data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +307 -0
  41. data/embulk-core/src/main/java/org/embulk/exec/LocalExecutor.java +274 -0
  42. data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +30 -0
  43. data/embulk-core/src/main/java/org/embulk/exec/NoSampleException.java +10 -0
  44. data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +58 -0
  45. data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +138 -0
  46. data/embulk-core/src/main/java/org/embulk/exec/PreviewResult.java +27 -0
  47. data/embulk-core/src/main/java/org/embulk/exec/PreviewedNoticeError.java +17 -0
  48. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +116 -0
  49. data/embulk-core/src/main/java/org/embulk/exec/SystemConfigModule.java +24 -0
  50. data/embulk-core/src/main/java/org/embulk/jruby/JRubyPluginSource.java +69 -0
  51. data/embulk-core/src/main/java/org/embulk/jruby/JRubyScriptingModule.java +100 -0
  52. data/embulk-core/src/main/java/org/embulk/plugin/BuiltinPluginSourceModule.java +17 -0
  53. data/embulk-core/src/main/java/org/embulk/plugin/InjectedPluginSource.java +92 -0
  54. data/embulk-core/src/main/java/org/embulk/plugin/PluginManager.java +34 -0
  55. data/embulk-core/src/main/java/org/embulk/plugin/PluginSource.java +6 -0
  56. data/embulk-core/src/main/java/org/embulk/plugin/PluginSourceNotMatchException.java +19 -0
  57. data/embulk-core/src/main/java/org/embulk/plugin/PluginType.java +47 -0
  58. data/embulk-core/src/main/java/org/embulk/plugin/SetThreadContextClassLoader.java +19 -0
  59. data/embulk-core/src/main/java/org/embulk/spi/Buffer.java +113 -0
  60. data/embulk-core/src/main/java/org/embulk/spi/BufferAllocator.java +8 -0
  61. data/embulk-core/src/main/java/org/embulk/spi/Column.java +92 -0
  62. data/embulk-core/src/main/java/org/embulk/spi/ColumnConfig.java +79 -0
  63. data/embulk-core/src/main/java/org/embulk/spi/DecoderPlugin.java +16 -0
  64. data/embulk-core/src/main/java/org/embulk/spi/EncoderPlugin.java +16 -0
  65. data/embulk-core/src/main/java/org/embulk/spi/Exec.java +76 -0
  66. data/embulk-core/src/main/java/org/embulk/spi/ExecAction.java +6 -0
  67. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +105 -0
  68. data/embulk-core/src/main/java/org/embulk/spi/Extension.java +42 -0
  69. data/embulk-core/src/main/java/org/embulk/spi/FileInput.java +11 -0
  70. data/embulk-core/src/main/java/org/embulk/spi/FileInputPlugin.java +19 -0
  71. data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +113 -0
  72. data/embulk-core/src/main/java/org/embulk/spi/FileOutput.java +13 -0
  73. data/embulk-core/src/main/java/org/embulk/spi/FileOutputPlugin.java +20 -0
  74. data/embulk-core/src/main/java/org/embulk/spi/FileOutputRunner.java +167 -0
  75. data/embulk-core/src/main/java/org/embulk/spi/FormatterPlugin.java +18 -0
  76. data/embulk-core/src/main/java/org/embulk/spi/GuessPlugin.java +9 -0
  77. data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +20 -0
  78. data/embulk-core/src/main/java/org/embulk/spi/OutputPlugin.java +21 -0
  79. data/embulk-core/src/main/java/org/embulk/spi/Page.java +45 -0
  80. data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +327 -0
  81. data/embulk-core/src/main/java/org/embulk/spi/PageFormat.java +47 -0
  82. data/embulk-core/src/main/java/org/embulk/spi/PageOutput.java +11 -0
  83. data/embulk-core/src/main/java/org/embulk/spi/PageReader.java +227 -0
  84. data/embulk-core/src/main/java/org/embulk/spi/ParserPlugin.java +17 -0
  85. data/embulk-core/src/main/java/org/embulk/spi/Schema.java +101 -0
  86. data/embulk-core/src/main/java/org/embulk/spi/SchemaConfig.java +52 -0
  87. data/embulk-core/src/main/java/org/embulk/spi/SchemaVisitor.java +14 -0
  88. data/embulk-core/src/main/java/org/embulk/spi/Transactional.java +10 -0
  89. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileInput.java +17 -0
  90. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileOutput.java +19 -0
  91. data/embulk-core/src/main/java/org/embulk/spi/TransactionalPageOutput.java +17 -0
  92. data/embulk-core/src/main/java/org/embulk/spi/time/DateTimeZoneSerDe.java +57 -0
  93. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelper.java +8 -0
  94. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelperFactory.java +6 -0
  95. data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +159 -0
  96. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +98 -0
  97. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormatter.java +55 -0
  98. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParseException.java +6 -0
  99. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParser.java +60 -0
  100. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampSerDe.java +50 -0
  101. data/embulk-core/src/main/java/org/embulk/spi/type/AbstractType.java +55 -0
  102. data/embulk-core/src/main/java/org/embulk/spi/type/BooleanType.java +12 -0
  103. data/embulk-core/src/main/java/org/embulk/spi/type/DoubleType.java +12 -0
  104. data/embulk-core/src/main/java/org/embulk/spi/type/LongType.java +12 -0
  105. data/embulk-core/src/main/java/org/embulk/spi/type/StringType.java +12 -0
  106. data/embulk-core/src/main/java/org/embulk/spi/type/TimestampType.java +39 -0
  107. data/embulk-core/src/main/java/org/embulk/spi/type/Type.java +15 -0
  108. data/embulk-core/src/main/java/org/embulk/spi/type/TypeDeserializer.java +47 -0
  109. data/embulk-core/src/main/java/org/embulk/spi/type/Types.java +14 -0
  110. data/embulk-core/src/main/java/org/embulk/spi/util/CharsetSerDe.java +55 -0
  111. data/embulk-core/src/main/java/org/embulk/spi/util/Decoders.java +81 -0
  112. data/embulk-core/src/main/java/org/embulk/spi/util/Encoders.java +81 -0
  113. data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +110 -0
  114. data/embulk-core/src/main/java/org/embulk/spi/util/FileOutputOutputStream.java +94 -0
  115. data/embulk-core/src/main/java/org/embulk/spi/util/InputStreamFileInput.java +111 -0
  116. data/embulk-core/src/main/java/org/embulk/spi/util/Inputs.java +74 -0
  117. data/embulk-core/src/main/java/org/embulk/spi/util/LineDecoder.java +118 -0
  118. data/embulk-core/src/main/java/org/embulk/spi/util/LineEncoder.java +109 -0
  119. data/embulk-core/src/main/java/org/embulk/spi/util/ListFileInput.java +52 -0
  120. data/embulk-core/src/main/java/org/embulk/spi/util/Newline.java +38 -0
  121. data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +102 -0
  122. data/embulk-core/src/main/java/org/embulk/spi/util/Pages.java +139 -0
  123. data/embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java +110 -0
  124. data/embulk-core/src/test/java/org/embulk/GuiceBinder.java +72 -0
  125. data/embulk-core/src/test/java/org/embulk/RandomManager.java +53 -0
  126. data/embulk-core/src/test/java/org/embulk/TestPluginSourceModule.java +23 -0
  127. data/embulk-core/src/test/java/org/embulk/TestUtilityModule.java +17 -0
  128. data/embulk-core/src/test/java/org/embulk/config/TestConfigSource.java +114 -0
  129. data/embulk-core/src/test/java/org/embulk/config/TestTaskSource.java +70 -0
  130. data/embulk-core/src/test/java/org/embulk/plugin/MockPluginSource.java +57 -0
  131. data/embulk-core/src/test/java/org/embulk/plugin/TestPluginType.java +18 -0
  132. data/embulk-core/src/test/java/org/embulk/spi/MockFileOutput.java +63 -0
  133. data/embulk-core/src/test/java/org/embulk/spi/MockFormatterPlugin.java +101 -0
  134. data/embulk-core/src/test/java/org/embulk/spi/MockParserPlugin.java +73 -0
  135. data/embulk-core/src/test/java/org/embulk/spi/PageTestUtils.java +78 -0
  136. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +67 -0
  137. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputRunner.java +180 -0
  138. data/embulk-core/src/test/java/org/embulk/spi/TestFileOutputRunner.java +192 -0
  139. data/embulk-core/src/test/java/org/embulk/spi/TestInputStreamFileInput.java +188 -0
  140. data/embulk-core/src/test/java/org/embulk/spi/TestPageBuilderReader.java +301 -0
  141. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestamp.java +116 -0
  142. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestampFormatterParser.java +52 -0
  143. data/embulk-core/src/test/java/org/embulk/spi/type/TestTypeSerDe.java +45 -0
  144. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineDecoder.java +132 -0
  145. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineEncoder.java +123 -0
  146. data/embulk-standards/build.gradle +6 -0
  147. data/embulk-standards/pom.xml +68 -0
  148. data/embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java +158 -0
  149. data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +233 -0
  150. data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +355 -0
  151. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java +55 -0
  152. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java +39 -0
  153. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +138 -0
  154. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java +128 -0
  155. data/embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java +46 -0
  156. data/embulk-standards/src/main/java/org/embulk/standards/S3FileInputPlugin.java +238 -0
  157. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginExtension.java +16 -0
  158. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +44 -0
  159. data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +71 -0
  160. data/embulk-standards/src/main/resources/META-INF/services/org.embulk.spi.Extension +1 -0
  161. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +69 -0
  162. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +291 -0
  163. data/embulk-standards/src/test/java/org/embulk/standards/TestS3FileInputPlugin.java +43 -0
  164. data/embulk.gemspec +27 -0
  165. data/examples/config.yml +34 -0
  166. data/examples/csv/sample.csv.gz +0 -0
  167. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  168. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  169. data/gradlew +164 -0
  170. data/gradlew.bat +90 -0
  171. data/lib/embulk.rb +16 -0
  172. data/lib/embulk/buffer.rb +17 -0
  173. data/lib/embulk/column.rb +47 -0
  174. data/lib/embulk/command/embulk.rb +39 -0
  175. data/lib/embulk/command/embulk_example.rb +32 -0
  176. data/lib/embulk/command/embulk_generate_bin.rb +62 -0
  177. data/lib/embulk/command/embulk_run.rb +243 -0
  178. data/lib/embulk/data/bundle/.bundle/config +3 -0
  179. data/lib/embulk/data/bundle/Gemfile +31 -0
  180. data/lib/embulk/data/bundle/Gemfile.lock +8 -0
  181. data/lib/embulk/data/bundle/embulk/input_example.rb +40 -0
  182. data/lib/embulk/data/bundle/embulk/output_example.rb +51 -0
  183. data/lib/embulk/data_source.rb +66 -0
  184. data/lib/embulk/error.rb +5 -0
  185. data/lib/embulk/guess_charset.rb +26 -0
  186. data/lib/embulk/guess_csv.rb +195 -0
  187. data/lib/embulk/guess_gzip.rb +18 -0
  188. data/lib/embulk/guess_newline.rb +20 -0
  189. data/lib/embulk/guess_plugin.rb +113 -0
  190. data/lib/embulk/input_plugin.rb +53 -0
  191. data/lib/embulk/java/bootstrap.rb +12 -0
  192. data/lib/embulk/java/imports.rb +26 -0
  193. data/lib/embulk/java/time_helper.rb +77 -0
  194. data/lib/embulk/output_plugin.rb +104 -0
  195. data/lib/embulk/page.rb +28 -0
  196. data/lib/embulk/page_builder.rb +22 -0
  197. data/lib/embulk/plugin.rb +152 -0
  198. data/lib/embulk/plugin_registry.rb +70 -0
  199. data/lib/embulk/schema.rb +85 -0
  200. data/lib/embulk/time_format_guess.rb +331 -0
  201. data/lib/embulk/version.rb +3 -0
  202. data/pom.xml +533 -0
  203. data/settings.gradle +5 -0
  204. metadata +370 -0
@@ -0,0 +1,355 @@
1
+ package org.embulk.standards;
2
+
3
+ import com.google.common.base.Preconditions;
4
+ import java.util.List;
5
+ import java.util.ArrayList;
6
+ import java.util.Deque;
7
+ import java.util.ArrayDeque;
8
+ import java.util.Iterator;
9
+ import org.embulk.spi.util.LineDecoder;
10
+
11
+ public class CsvTokenizer
12
+ {
13
+ static enum RecordState
14
+ {
15
+ NOT_END, END,
16
+ }
17
+
18
+ static enum ColumnState
19
+ {
20
+ BEGIN, VALUE, QUOTED_VALUE, AFTER_QUOTED_VALUE, FIRST_TRIM, LAST_TRIM_OR_VALUE,
21
+ }
22
+
23
+ private static final char END_OF_LINE = '\0';
24
+ private static final boolean TRACE = false;
25
+
26
+ private final char delimiter;
27
+ private final char quote;
28
+ private final char escape;
29
+ private final String newline;
30
+ private final boolean trimIfNotQuoted;
31
+ private final long maxQuotedSizeLimit; // TODO not used yet
32
+ private final LineDecoder input;
33
+
34
+ private RecordState recordState = RecordState.END; // initial state is end of a record. nextRecord() must be called first
35
+ private long lineNumber = 0;
36
+
37
+ private String line = null;
38
+ private int linePos = 0;
39
+ private boolean wasQuotedColumn = false;
40
+ private List<String> quotedValueLines = new ArrayList<>();
41
+ private Deque<String> unreadLines = new ArrayDeque<>();
42
+
43
+ public CsvTokenizer(LineDecoder input, CsvParserPlugin.PluginTask task)
44
+ {
45
+ delimiter = task.getDelimiterChar();
46
+ quote = task.getQuoteChar() != '\0' ? task.getQuoteChar() : '"';
47
+ escape = task.getEscapeChar();
48
+ newline = task.getNewline().getString();
49
+ trimIfNotQuoted = task.getTrimIfNotQuoted();
50
+ maxQuotedSizeLimit = task.getMaxQuotedSizeLimit();
51
+ this.input = input;
52
+ }
53
+
54
+ public long getCurrentLineNumber()
55
+ {
56
+ return lineNumber;
57
+ }
58
+
59
+ // returns skipped line
60
+ public String skipCurrentLine()
61
+ {
62
+ String skippedLine;
63
+ if (quotedValueLines.isEmpty()) {
64
+ skippedLine = line;
65
+ } else {
66
+ // recover lines of quoted value
67
+ skippedLine = quotedValueLines.remove(0); // TODO optimize performance
68
+ unreadLines.addAll(quotedValueLines);
69
+ unreadLines.add(line);
70
+ lineNumber -= quotedValueLines.size();
71
+ quotedValueLines.clear();
72
+ }
73
+ recordState = RecordState.END;
74
+ return line;
75
+ }
76
+
77
+ public boolean nextFile()
78
+ {
79
+ return input.nextFile();
80
+ }
81
+
82
+ public boolean nextRecord()
83
+ {
84
+ // If at the end of record, read the next line and initialize the state
85
+ Preconditions.checkState(recordState == RecordState.END, "too many columns"); // TODO exception class
86
+ boolean hasNext = nextLine(true);
87
+ if (hasNext) {
88
+ recordState = RecordState.NOT_END;
89
+ return true;
90
+ } else {
91
+ return false;
92
+ }
93
+ }
94
+
95
+ private boolean nextLine(boolean ignoreEmptyLine)
96
+ {
97
+ while (true) {
98
+ if (!unreadLines.isEmpty()) {
99
+ line = unreadLines.removeFirst();
100
+ } else {
101
+ line = input.poll();
102
+ if (line == null) {
103
+ return false;
104
+ }
105
+ }
106
+ linePos = 0;
107
+ lineNumber++;
108
+
109
+ if (TRACE) {
110
+ System.out.println("#MN line: " + line + " (" + lineNumber + ")");
111
+ }
112
+
113
+ if (!line.isEmpty() || !ignoreEmptyLine) {
114
+ return true;
115
+ }
116
+ }
117
+ }
118
+
119
+ public String nextColumn()
120
+ {
121
+ Preconditions.checkState(recordState == RecordState.NOT_END, "doesn't have enough columns"); // TODO exception class
122
+
123
+ // reset last state
124
+ wasQuotedColumn = false;
125
+ quotedValueLines.clear();
126
+
127
+ // local state
128
+ int valueStartPos = linePos;
129
+ int valueEndPos = 0; // initialized by VALUE state and used by LAST_TRIM_OR_VALUE and
130
+ StringBuilder quotedValue = null; // initial by VALUE or FIRST_TRIM state and used by QUOTED_VALUE state
131
+ ColumnState columnState = ColumnState.BEGIN;
132
+
133
+ while (true) {
134
+ final char c = nextChar();
135
+ if (TRACE) {
136
+ System.out.println("#MN c: " + c + " (" + columnState + "," + recordState + ")");
137
+ try { Thread.sleep(100); } catch (Exception e) {}
138
+ }
139
+
140
+ switch (columnState) {
141
+ case BEGIN:
142
+ // TODO optimization: state is BEGIN only at the first character of a column.
143
+ // this block can be out of the looop.
144
+ if (isDelimiter(c)) {
145
+ // empty value
146
+ return "";
147
+
148
+ } else if (isEndOfLine(c)) {
149
+ // empty value
150
+ recordState = RecordState.END;
151
+ return "";
152
+
153
+ } else if (isSpace(c) && trimIfNotQuoted) {
154
+ columnState = ColumnState.FIRST_TRIM;
155
+
156
+ } else if (isQuote(c)) {
157
+ valueStartPos = linePos; // == 1
158
+ wasQuotedColumn = true;
159
+ quotedValue = new StringBuilder();
160
+ columnState = ColumnState.QUOTED_VALUE;
161
+
162
+ } else {
163
+ columnState = ColumnState.VALUE;
164
+ }
165
+ break;
166
+
167
+ case FIRST_TRIM:
168
+ if (isDelimiter(c)) {
169
+ // empty value
170
+ return "";
171
+
172
+ } else if (isEndOfLine(c)) {
173
+ // empty value
174
+ recordState = RecordState.END;
175
+ return "";
176
+
177
+ } else if (isQuote(c)) {
178
+ // column has heading spaces and quoted. TODO should this be rejected?
179
+ valueStartPos = linePos;
180
+ wasQuotedColumn = true;
181
+ quotedValue = new StringBuilder();
182
+ columnState = ColumnState.QUOTED_VALUE;
183
+
184
+ } else if (isSpace(c)) {
185
+ // skip this character
186
+
187
+ } else {
188
+ valueStartPos = linePos - 1;
189
+ columnState = ColumnState.VALUE;
190
+ }
191
+ break;
192
+
193
+ case VALUE:
194
+ if (isDelimiter(c)) {
195
+ return line.substring(valueStartPos, linePos - 1);
196
+
197
+ } else if (isEndOfLine(c)) {
198
+ recordState = RecordState.END;
199
+ return line.substring(valueStartPos, linePos);
200
+
201
+ } else if (isSpace(c) && trimIfNotQuoted) {
202
+ valueEndPos = linePos - 1; // this is possibly end of value
203
+ columnState = ColumnState.LAST_TRIM_OR_VALUE;
204
+
205
+ // TODO not implemented yet foo""bar""baz -> [foo, bar, baz].append
206
+ //} else if (isQuote(c)) {
207
+ // // In RFC4180, If fields are not enclosed with double quotes, then
208
+ // // double quotes may not appear inside the fields. But they are often
209
+ // // included in the fields. We should care about them later.
210
+
211
+ } else {
212
+ // keep VALUE state
213
+ }
214
+ break;
215
+
216
+ case LAST_TRIM_OR_VALUE:
217
+ if (isDelimiter(c)) {
218
+ return line.substring(valueStartPos, valueEndPos);
219
+
220
+ } else if (isEndOfLine(c)) {
221
+ recordState = RecordState.END;
222
+ return line.substring(valueStartPos, valueEndPos);
223
+
224
+ } else if (isSpace(c)) {
225
+ // keep LAST_TRIM_OR_VALUE state
226
+
227
+ } else {
228
+ // this spaces are not trailing spaces. go back to VALUE state
229
+ columnState = ColumnState.BEGIN;
230
+ }
231
+ break;
232
+
233
+ case QUOTED_VALUE:
234
+ if (isEndOfLine(c)) {
235
+ // multi-line quoted value
236
+ quotedValue.append(line.substring(valueStartPos, linePos));
237
+ quotedValue.append(newline);
238
+ quotedValueLines.add(line);
239
+ if (!nextLine(false)) {
240
+ throw new RuntimeException("Unexpected end of line during parsing a quoted value"); // TODO exception class
241
+ }
242
+ valueStartPos = 0;
243
+
244
+ } else if (isQuote(c)) {
245
+ char next = peekNextChar();
246
+ if (TRACE) {
247
+ System.out.println("#MN peeked c: " + next + " (" + columnState + "," + recordState + ")");
248
+ }
249
+ if (isQuote(next)) { // escaped quote
250
+ quotedValue.append(line.substring(valueStartPos, linePos));
251
+ valueStartPos = ++linePos;
252
+ } else {
253
+ quotedValue.append(line.substring(valueStartPos, linePos - 1));
254
+ columnState = ColumnState.AFTER_QUOTED_VALUE;
255
+ }
256
+
257
+ } else if (isEscape(c)) { // isQuote must be checked first in case of quote == escape
258
+ // In RFC 4180, CSV's escape char is '\"'. But '\\' is often used.
259
+ char next = peekNextChar();
260
+ if (TRACE) {
261
+ System.out.println("#MN peeked c: " + next + " (" + columnState + "," + recordState + ")");
262
+ }
263
+ if (isEndOfLine(c)) {
264
+ // escape end of line. TODO assuming multi-line quoted value without newline?
265
+ quotedValue.append(line.substring(valueStartPos, linePos));
266
+ quotedValueLines.add(line);
267
+ if (!nextLine(false)) {
268
+ throw new RuntimeException("Unexpected end of line during parsing a quoted value"); // TODO exception class
269
+ }
270
+ valueStartPos = 0;
271
+ } else if (isQuote(next) || isEscape(next)) { // escaped quote
272
+ quotedValue.append(line.substring(valueStartPos, linePos - 1));
273
+ quotedValue.append(next);
274
+ valueStartPos = ++linePos;
275
+ }
276
+
277
+ } else {
278
+ // keep QUOTED_VALUE state
279
+ }
280
+ break;
281
+
282
+ case AFTER_QUOTED_VALUE:
283
+ if (isDelimiter(c)) {
284
+ return quotedValue.toString();
285
+
286
+ } else if (isEndOfLine(c)) {
287
+ recordState = RecordState.END;
288
+ return quotedValue.toString();
289
+
290
+ } else if (isSpace(c)) {
291
+ // column has trailing spaces and quoted. TODO should this be rejected?
292
+
293
+ } else {
294
+ throw new RuntimeException("Unexpected extra character after quoted value"); // TODO exception class
295
+ }
296
+ break;
297
+
298
+ default:
299
+ assert false;
300
+ }
301
+ }
302
+ }
303
+
304
+ public boolean wasQuotedColumn()
305
+ {
306
+ return wasQuotedColumn;
307
+ }
308
+
309
+ private char nextChar()
310
+ {
311
+ Preconditions.checkState(line != null, "nextColumn is called after end of file");
312
+
313
+ if (linePos >= line.length()) {
314
+ return END_OF_LINE;
315
+ } else {
316
+ return line.charAt(linePos++);
317
+ }
318
+ }
319
+
320
+ private char peekNextChar()
321
+ {
322
+ Preconditions.checkState(line != null, "peekNextChar is called after end of file");
323
+
324
+ if (linePos >= line.length()) {
325
+ return END_OF_LINE;
326
+ } else {
327
+ return line.charAt(linePos);
328
+ }
329
+ }
330
+
331
+ private boolean isSpace(char c)
332
+ {
333
+ return c == ' ';
334
+ }
335
+
336
+ private boolean isDelimiter(char c)
337
+ {
338
+ return c == delimiter;
339
+ }
340
+
341
+ private boolean isEndOfLine(char c)
342
+ {
343
+ return c == END_OF_LINE;
344
+ }
345
+
346
+ private boolean isQuote(char c)
347
+ {
348
+ return c == quote;
349
+ }
350
+
351
+ private boolean isEscape(char c)
352
+ {
353
+ return c == escape;
354
+ }
355
+ }
@@ -0,0 +1,55 @@
1
+ package org.embulk.standards;
2
+
3
+ import java.io.InputStream;
4
+ import java.io.IOException;
5
+ import java.util.zip.GZIPInputStream;
6
+ import com.fasterxml.jackson.annotation.JacksonInject;
7
+ import org.embulk.config.Task;
8
+ import org.embulk.config.TaskSource;
9
+ import org.embulk.config.ConfigSource;
10
+ import org.embulk.spi.DecoderPlugin;
11
+ import org.embulk.spi.BufferAllocator;
12
+ import org.embulk.spi.FileInput;
13
+ import org.embulk.spi.util.FileInputInputStream;
14
+ import org.embulk.spi.util.InputStreamFileInput;
15
+
16
+ public class GzipFileDecoderPlugin
17
+ implements DecoderPlugin
18
+ {
19
+ public interface PluginTask
20
+ extends Task
21
+ {
22
+ @JacksonInject
23
+ public BufferAllocator getBufferAllocator();
24
+ }
25
+
26
+ @Override
27
+ public void transaction(ConfigSource config, DecoderPlugin.Control control)
28
+ {
29
+ PluginTask task = config.loadConfig(PluginTask.class);
30
+ control.run(task.dump());
31
+ }
32
+
33
+ @Override
34
+ public FileInput open(TaskSource taskSource, FileInput input)
35
+ {
36
+ PluginTask task = taskSource.loadTask(PluginTask.class);
37
+ final FileInputInputStream files = new FileInputInputStream(input);
38
+ return new InputStreamFileInput(
39
+ task.getBufferAllocator(),
40
+ new InputStreamFileInput.Provider() {
41
+ public InputStream openNext() throws IOException
42
+ {
43
+ if (!files.nextFile()) {
44
+ return null;
45
+ }
46
+ return new GZIPInputStream(files);
47
+ }
48
+
49
+ public void close() throws IOException
50
+ {
51
+ files.close();
52
+ }
53
+ });
54
+ }
55
+ }
@@ -0,0 +1,39 @@
1
+ package org.embulk.standards;
2
+
3
+ import java.io.OutputStream;
4
+ import java.io.IOException;
5
+ import java.util.zip.GZIPOutputStream;
6
+ import org.embulk.config.Task;
7
+ import org.embulk.config.Config;
8
+ import org.embulk.config.ConfigDefault;
9
+ import org.embulk.config.TaskSource;
10
+ import org.embulk.config.ConfigSource;
11
+ import org.embulk.spi.EncoderPlugin;
12
+ import org.embulk.spi.FileOutput;
13
+ import org.embulk.spi.util.FileOutputOutputStream;
14
+
15
+ public class GzipFileEncoderPlugin
16
+ implements EncoderPlugin
17
+ {
18
+ public interface PluginTask
19
+ extends Task
20
+ {
21
+ @Config("level")
22
+ @ConfigDefault("6")
23
+ public int getLevel();
24
+ }
25
+
26
+ public void transaction(ConfigSource config, EncoderPlugin.Control control)
27
+ {
28
+ PluginTask task = config.loadConfig(PluginTask.class);
29
+ control.run(task.dump());
30
+ }
31
+
32
+ @Override
33
+ public FileOutput open(TaskSource taskSource, FileOutput fileOutput)
34
+ {
35
+ throw new AssertionError("OutputStreamFileOutput is not implemented yet");
36
+ // TODO GZIPOutputStream doesn't support level option?
37
+ //return new OutputStreamFileOutput(new GZIPOutputStream(new FileOutputOutputStream(fileOutput)));
38
+ }
39
+ }