embulk 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +13 -0
  3. data/Gemfile +3 -0
  4. data/Gemfile.lock +33 -0
  5. data/README.md +117 -0
  6. data/Rakefile +58 -0
  7. data/bin/embulk +63 -0
  8. data/build.gradle +149 -0
  9. data/embulk-cli/build.gradle +6 -0
  10. data/embulk-cli/pom.xml +94 -0
  11. data/embulk-cli/src/main/java/org/embulk/cli/Main.java +15 -0
  12. data/embulk-core/build.gradle +6 -0
  13. data/embulk-core/pom.xml +143 -0
  14. data/embulk-core/src/main/java/org/embulk/EmbulkService.java +39 -0
  15. data/embulk-core/src/main/java/org/embulk/command/Runner.java +199 -0
  16. data/embulk-core/src/main/java/org/embulk/command/TablePrinter.java +119 -0
  17. data/embulk-core/src/main/java/org/embulk/config/CommitReport.java +26 -0
  18. data/embulk-core/src/main/java/org/embulk/config/Config.java +15 -0
  19. data/embulk-core/src/main/java/org/embulk/config/ConfigDefault.java +15 -0
  20. data/embulk-core/src/main/java/org/embulk/config/ConfigException.java +20 -0
  21. data/embulk-core/src/main/java/org/embulk/config/ConfigLoader.java +83 -0
  22. data/embulk-core/src/main/java/org/embulk/config/ConfigSource.java +28 -0
  23. data/embulk-core/src/main/java/org/embulk/config/DataSource.java +35 -0
  24. data/embulk-core/src/main/java/org/embulk/config/DataSourceImpl.java +208 -0
  25. data/embulk-core/src/main/java/org/embulk/config/DataSourceSerDe.java +80 -0
  26. data/embulk-core/src/main/java/org/embulk/config/GenericTypeReference.java +20 -0
  27. data/embulk-core/src/main/java/org/embulk/config/ModelManager.java +125 -0
  28. data/embulk-core/src/main/java/org/embulk/config/NextConfig.java +26 -0
  29. data/embulk-core/src/main/java/org/embulk/config/Task.java +10 -0
  30. data/embulk-core/src/main/java/org/embulk/config/TaskInvocationHandler.java +180 -0
  31. data/embulk-core/src/main/java/org/embulk/config/TaskSerDe.java +343 -0
  32. data/embulk-core/src/main/java/org/embulk/config/TaskSource.java +28 -0
  33. data/embulk-core/src/main/java/org/embulk/config/TaskValidationException.java +37 -0
  34. data/embulk-core/src/main/java/org/embulk/config/TaskValidator.java +24 -0
  35. data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +45 -0
  36. data/embulk-core/src/main/java/org/embulk/exec/ExecuteInterruptedException.java +10 -0
  37. data/embulk-core/src/main/java/org/embulk/exec/ExecuteResult.java +19 -0
  38. data/embulk-core/src/main/java/org/embulk/exec/ExtensionServiceLoaderModule.java +43 -0
  39. data/embulk-core/src/main/java/org/embulk/exec/ForSystemConfig.java +16 -0
  40. data/embulk-core/src/main/java/org/embulk/exec/GuessExecutor.java +307 -0
  41. data/embulk-core/src/main/java/org/embulk/exec/LocalExecutor.java +274 -0
  42. data/embulk-core/src/main/java/org/embulk/exec/LoggerProvider.java +30 -0
  43. data/embulk-core/src/main/java/org/embulk/exec/NoSampleException.java +10 -0
  44. data/embulk-core/src/main/java/org/embulk/exec/PooledBufferAllocator.java +58 -0
  45. data/embulk-core/src/main/java/org/embulk/exec/PreviewExecutor.java +138 -0
  46. data/embulk-core/src/main/java/org/embulk/exec/PreviewResult.java +27 -0
  47. data/embulk-core/src/main/java/org/embulk/exec/PreviewedNoticeError.java +17 -0
  48. data/embulk-core/src/main/java/org/embulk/exec/SamplingParserPlugin.java +116 -0
  49. data/embulk-core/src/main/java/org/embulk/exec/SystemConfigModule.java +24 -0
  50. data/embulk-core/src/main/java/org/embulk/jruby/JRubyPluginSource.java +69 -0
  51. data/embulk-core/src/main/java/org/embulk/jruby/JRubyScriptingModule.java +100 -0
  52. data/embulk-core/src/main/java/org/embulk/plugin/BuiltinPluginSourceModule.java +17 -0
  53. data/embulk-core/src/main/java/org/embulk/plugin/InjectedPluginSource.java +92 -0
  54. data/embulk-core/src/main/java/org/embulk/plugin/PluginManager.java +34 -0
  55. data/embulk-core/src/main/java/org/embulk/plugin/PluginSource.java +6 -0
  56. data/embulk-core/src/main/java/org/embulk/plugin/PluginSourceNotMatchException.java +19 -0
  57. data/embulk-core/src/main/java/org/embulk/plugin/PluginType.java +47 -0
  58. data/embulk-core/src/main/java/org/embulk/plugin/SetThreadContextClassLoader.java +19 -0
  59. data/embulk-core/src/main/java/org/embulk/spi/Buffer.java +113 -0
  60. data/embulk-core/src/main/java/org/embulk/spi/BufferAllocator.java +8 -0
  61. data/embulk-core/src/main/java/org/embulk/spi/Column.java +92 -0
  62. data/embulk-core/src/main/java/org/embulk/spi/ColumnConfig.java +79 -0
  63. data/embulk-core/src/main/java/org/embulk/spi/DecoderPlugin.java +16 -0
  64. data/embulk-core/src/main/java/org/embulk/spi/EncoderPlugin.java +16 -0
  65. data/embulk-core/src/main/java/org/embulk/spi/Exec.java +76 -0
  66. data/embulk-core/src/main/java/org/embulk/spi/ExecAction.java +6 -0
  67. data/embulk-core/src/main/java/org/embulk/spi/ExecSession.java +105 -0
  68. data/embulk-core/src/main/java/org/embulk/spi/Extension.java +42 -0
  69. data/embulk-core/src/main/java/org/embulk/spi/FileInput.java +11 -0
  70. data/embulk-core/src/main/java/org/embulk/spi/FileInputPlugin.java +19 -0
  71. data/embulk-core/src/main/java/org/embulk/spi/FileInputRunner.java +113 -0
  72. data/embulk-core/src/main/java/org/embulk/spi/FileOutput.java +13 -0
  73. data/embulk-core/src/main/java/org/embulk/spi/FileOutputPlugin.java +20 -0
  74. data/embulk-core/src/main/java/org/embulk/spi/FileOutputRunner.java +167 -0
  75. data/embulk-core/src/main/java/org/embulk/spi/FormatterPlugin.java +18 -0
  76. data/embulk-core/src/main/java/org/embulk/spi/GuessPlugin.java +9 -0
  77. data/embulk-core/src/main/java/org/embulk/spi/InputPlugin.java +20 -0
  78. data/embulk-core/src/main/java/org/embulk/spi/OutputPlugin.java +21 -0
  79. data/embulk-core/src/main/java/org/embulk/spi/Page.java +45 -0
  80. data/embulk-core/src/main/java/org/embulk/spi/PageBuilder.java +327 -0
  81. data/embulk-core/src/main/java/org/embulk/spi/PageFormat.java +47 -0
  82. data/embulk-core/src/main/java/org/embulk/spi/PageOutput.java +11 -0
  83. data/embulk-core/src/main/java/org/embulk/spi/PageReader.java +227 -0
  84. data/embulk-core/src/main/java/org/embulk/spi/ParserPlugin.java +17 -0
  85. data/embulk-core/src/main/java/org/embulk/spi/Schema.java +101 -0
  86. data/embulk-core/src/main/java/org/embulk/spi/SchemaConfig.java +52 -0
  87. data/embulk-core/src/main/java/org/embulk/spi/SchemaVisitor.java +14 -0
  88. data/embulk-core/src/main/java/org/embulk/spi/Transactional.java +10 -0
  89. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileInput.java +17 -0
  90. data/embulk-core/src/main/java/org/embulk/spi/TransactionalFileOutput.java +19 -0
  91. data/embulk-core/src/main/java/org/embulk/spi/TransactionalPageOutput.java +17 -0
  92. data/embulk-core/src/main/java/org/embulk/spi/time/DateTimeZoneSerDe.java +57 -0
  93. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelper.java +8 -0
  94. data/embulk-core/src/main/java/org/embulk/spi/time/JRubyTimeParserHelperFactory.java +6 -0
  95. data/embulk-core/src/main/java/org/embulk/spi/time/Timestamp.java +159 -0
  96. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormat.java +98 -0
  97. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampFormatter.java +55 -0
  98. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParseException.java +6 -0
  99. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampParser.java +60 -0
  100. data/embulk-core/src/main/java/org/embulk/spi/time/TimestampSerDe.java +50 -0
  101. data/embulk-core/src/main/java/org/embulk/spi/type/AbstractType.java +55 -0
  102. data/embulk-core/src/main/java/org/embulk/spi/type/BooleanType.java +12 -0
  103. data/embulk-core/src/main/java/org/embulk/spi/type/DoubleType.java +12 -0
  104. data/embulk-core/src/main/java/org/embulk/spi/type/LongType.java +12 -0
  105. data/embulk-core/src/main/java/org/embulk/spi/type/StringType.java +12 -0
  106. data/embulk-core/src/main/java/org/embulk/spi/type/TimestampType.java +39 -0
  107. data/embulk-core/src/main/java/org/embulk/spi/type/Type.java +15 -0
  108. data/embulk-core/src/main/java/org/embulk/spi/type/TypeDeserializer.java +47 -0
  109. data/embulk-core/src/main/java/org/embulk/spi/type/Types.java +14 -0
  110. data/embulk-core/src/main/java/org/embulk/spi/util/CharsetSerDe.java +55 -0
  111. data/embulk-core/src/main/java/org/embulk/spi/util/Decoders.java +81 -0
  112. data/embulk-core/src/main/java/org/embulk/spi/util/Encoders.java +81 -0
  113. data/embulk-core/src/main/java/org/embulk/spi/util/FileInputInputStream.java +110 -0
  114. data/embulk-core/src/main/java/org/embulk/spi/util/FileOutputOutputStream.java +94 -0
  115. data/embulk-core/src/main/java/org/embulk/spi/util/InputStreamFileInput.java +111 -0
  116. data/embulk-core/src/main/java/org/embulk/spi/util/Inputs.java +74 -0
  117. data/embulk-core/src/main/java/org/embulk/spi/util/LineDecoder.java +118 -0
  118. data/embulk-core/src/main/java/org/embulk/spi/util/LineEncoder.java +109 -0
  119. data/embulk-core/src/main/java/org/embulk/spi/util/ListFileInput.java +52 -0
  120. data/embulk-core/src/main/java/org/embulk/spi/util/Newline.java +38 -0
  121. data/embulk-core/src/main/java/org/embulk/spi/util/PagePrinter.java +102 -0
  122. data/embulk-core/src/main/java/org/embulk/spi/util/Pages.java +139 -0
  123. data/embulk-core/src/test/java/org/embulk/EmbulkTestRuntime.java +110 -0
  124. data/embulk-core/src/test/java/org/embulk/GuiceBinder.java +72 -0
  125. data/embulk-core/src/test/java/org/embulk/RandomManager.java +53 -0
  126. data/embulk-core/src/test/java/org/embulk/TestPluginSourceModule.java +23 -0
  127. data/embulk-core/src/test/java/org/embulk/TestUtilityModule.java +17 -0
  128. data/embulk-core/src/test/java/org/embulk/config/TestConfigSource.java +114 -0
  129. data/embulk-core/src/test/java/org/embulk/config/TestTaskSource.java +70 -0
  130. data/embulk-core/src/test/java/org/embulk/plugin/MockPluginSource.java +57 -0
  131. data/embulk-core/src/test/java/org/embulk/plugin/TestPluginType.java +18 -0
  132. data/embulk-core/src/test/java/org/embulk/spi/MockFileOutput.java +63 -0
  133. data/embulk-core/src/test/java/org/embulk/spi/MockFormatterPlugin.java +101 -0
  134. data/embulk-core/src/test/java/org/embulk/spi/MockParserPlugin.java +73 -0
  135. data/embulk-core/src/test/java/org/embulk/spi/PageTestUtils.java +78 -0
  136. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputInputStream.java +67 -0
  137. data/embulk-core/src/test/java/org/embulk/spi/TestFileInputRunner.java +180 -0
  138. data/embulk-core/src/test/java/org/embulk/spi/TestFileOutputRunner.java +192 -0
  139. data/embulk-core/src/test/java/org/embulk/spi/TestInputStreamFileInput.java +188 -0
  140. data/embulk-core/src/test/java/org/embulk/spi/TestPageBuilderReader.java +301 -0
  141. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestamp.java +116 -0
  142. data/embulk-core/src/test/java/org/embulk/spi/time/TestTimestampFormatterParser.java +52 -0
  143. data/embulk-core/src/test/java/org/embulk/spi/type/TestTypeSerDe.java +45 -0
  144. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineDecoder.java +132 -0
  145. data/embulk-core/src/test/java/org/embulk/spi/util/TestLineEncoder.java +123 -0
  146. data/embulk-standards/build.gradle +6 -0
  147. data/embulk-standards/pom.xml +68 -0
  148. data/embulk-standards/src/main/java/org/embulk/standards/CsvFormatterPlugin.java +158 -0
  149. data/embulk-standards/src/main/java/org/embulk/standards/CsvParserPlugin.java +233 -0
  150. data/embulk-standards/src/main/java/org/embulk/standards/CsvTokenizer.java +355 -0
  151. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileDecoderPlugin.java +55 -0
  152. data/embulk-standards/src/main/java/org/embulk/standards/GzipFileEncoderPlugin.java +39 -0
  153. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileInputPlugin.java +138 -0
  154. data/embulk-standards/src/main/java/org/embulk/standards/LocalFileOutputPlugin.java +128 -0
  155. data/embulk-standards/src/main/java/org/embulk/standards/NullOutputPlugin.java +46 -0
  156. data/embulk-standards/src/main/java/org/embulk/standards/S3FileInputPlugin.java +238 -0
  157. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginExtension.java +16 -0
  158. data/embulk-standards/src/main/java/org/embulk/standards/StandardPluginModule.java +44 -0
  159. data/embulk-standards/src/main/java/org/embulk/standards/StdoutOutputPlugin.java +71 -0
  160. data/embulk-standards/src/main/resources/META-INF/services/org.embulk.spi.Extension +1 -0
  161. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvParserPlugin.java +69 -0
  162. data/embulk-standards/src/test/java/org/embulk/standards/TestCsvTokenizer.java +291 -0
  163. data/embulk-standards/src/test/java/org/embulk/standards/TestS3FileInputPlugin.java +43 -0
  164. data/embulk.gemspec +27 -0
  165. data/examples/config.yml +34 -0
  166. data/examples/csv/sample.csv.gz +0 -0
  167. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  168. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  169. data/gradlew +164 -0
  170. data/gradlew.bat +90 -0
  171. data/lib/embulk.rb +16 -0
  172. data/lib/embulk/buffer.rb +17 -0
  173. data/lib/embulk/column.rb +47 -0
  174. data/lib/embulk/command/embulk.rb +39 -0
  175. data/lib/embulk/command/embulk_example.rb +32 -0
  176. data/lib/embulk/command/embulk_generate_bin.rb +62 -0
  177. data/lib/embulk/command/embulk_run.rb +243 -0
  178. data/lib/embulk/data/bundle/.bundle/config +3 -0
  179. data/lib/embulk/data/bundle/Gemfile +31 -0
  180. data/lib/embulk/data/bundle/Gemfile.lock +8 -0
  181. data/lib/embulk/data/bundle/embulk/input_example.rb +40 -0
  182. data/lib/embulk/data/bundle/embulk/output_example.rb +51 -0
  183. data/lib/embulk/data_source.rb +66 -0
  184. data/lib/embulk/error.rb +5 -0
  185. data/lib/embulk/guess_charset.rb +26 -0
  186. data/lib/embulk/guess_csv.rb +195 -0
  187. data/lib/embulk/guess_gzip.rb +18 -0
  188. data/lib/embulk/guess_newline.rb +20 -0
  189. data/lib/embulk/guess_plugin.rb +113 -0
  190. data/lib/embulk/input_plugin.rb +53 -0
  191. data/lib/embulk/java/bootstrap.rb +12 -0
  192. data/lib/embulk/java/imports.rb +26 -0
  193. data/lib/embulk/java/time_helper.rb +77 -0
  194. data/lib/embulk/output_plugin.rb +104 -0
  195. data/lib/embulk/page.rb +28 -0
  196. data/lib/embulk/page_builder.rb +22 -0
  197. data/lib/embulk/plugin.rb +152 -0
  198. data/lib/embulk/plugin_registry.rb +70 -0
  199. data/lib/embulk/schema.rb +85 -0
  200. data/lib/embulk/time_format_guess.rb +331 -0
  201. data/lib/embulk/version.rb +3 -0
  202. data/pom.xml +533 -0
  203. data/settings.gradle +5 -0
  204. metadata +370 -0
@@ -0,0 +1,355 @@
1
+ package org.embulk.standards;
2
+
3
+ import com.google.common.base.Preconditions;
4
+ import java.util.List;
5
+ import java.util.ArrayList;
6
+ import java.util.Deque;
7
+ import java.util.ArrayDeque;
8
+ import java.util.Iterator;
9
+ import org.embulk.spi.util.LineDecoder;
10
+
11
+ public class CsvTokenizer
12
+ {
13
+ static enum RecordState
14
+ {
15
+ NOT_END, END,
16
+ }
17
+
18
+ static enum ColumnState
19
+ {
20
+ BEGIN, VALUE, QUOTED_VALUE, AFTER_QUOTED_VALUE, FIRST_TRIM, LAST_TRIM_OR_VALUE,
21
+ }
22
+
23
+ private static final char END_OF_LINE = '\0';
24
+ private static final boolean TRACE = false;
25
+
26
+ private final char delimiter;
27
+ private final char quote;
28
+ private final char escape;
29
+ private final String newline;
30
+ private final boolean trimIfNotQuoted;
31
+ private final long maxQuotedSizeLimit; // TODO not used yet
32
+ private final LineDecoder input;
33
+
34
+ private RecordState recordState = RecordState.END; // initial state is end of a record. nextRecord() must be called first
35
+ private long lineNumber = 0;
36
+
37
+ private String line = null;
38
+ private int linePos = 0;
39
+ private boolean wasQuotedColumn = false;
40
+ private List<String> quotedValueLines = new ArrayList<>();
41
+ private Deque<String> unreadLines = new ArrayDeque<>();
42
+
43
+ public CsvTokenizer(LineDecoder input, CsvParserPlugin.PluginTask task)
44
+ {
45
+ delimiter = task.getDelimiterChar();
46
+ quote = task.getQuoteChar() != '\0' ? task.getQuoteChar() : '"';
47
+ escape = task.getEscapeChar();
48
+ newline = task.getNewline().getString();
49
+ trimIfNotQuoted = task.getTrimIfNotQuoted();
50
+ maxQuotedSizeLimit = task.getMaxQuotedSizeLimit();
51
+ this.input = input;
52
+ }
53
+
54
+ public long getCurrentLineNumber()
55
+ {
56
+ return lineNumber;
57
+ }
58
+
59
+ // returns skipped line
60
+ public String skipCurrentLine()
61
+ {
62
+ String skippedLine;
63
+ if (quotedValueLines.isEmpty()) {
64
+ skippedLine = line;
65
+ } else {
66
+ // recover lines of quoted value
67
+ skippedLine = quotedValueLines.remove(0); // TODO optimize performance
68
+ unreadLines.addAll(quotedValueLines);
69
+ unreadLines.add(line);
70
+ lineNumber -= quotedValueLines.size();
71
+ quotedValueLines.clear();
72
+ }
73
+ recordState = RecordState.END;
74
+ return line;
75
+ }
76
+
77
+ public boolean nextFile()
78
+ {
79
+ return input.nextFile();
80
+ }
81
+
82
+ public boolean nextRecord()
83
+ {
84
+ // If at the end of record, read the next line and initialize the state
85
+ Preconditions.checkState(recordState == RecordState.END, "too many columns"); // TODO exception class
86
+ boolean hasNext = nextLine(true);
87
+ if (hasNext) {
88
+ recordState = RecordState.NOT_END;
89
+ return true;
90
+ } else {
91
+ return false;
92
+ }
93
+ }
94
+
95
+ private boolean nextLine(boolean ignoreEmptyLine)
96
+ {
97
+ while (true) {
98
+ if (!unreadLines.isEmpty()) {
99
+ line = unreadLines.removeFirst();
100
+ } else {
101
+ line = input.poll();
102
+ if (line == null) {
103
+ return false;
104
+ }
105
+ }
106
+ linePos = 0;
107
+ lineNumber++;
108
+
109
+ if (TRACE) {
110
+ System.out.println("#MN line: " + line + " (" + lineNumber + ")");
111
+ }
112
+
113
+ if (!line.isEmpty() || !ignoreEmptyLine) {
114
+ return true;
115
+ }
116
+ }
117
+ }
118
+
119
+ public String nextColumn()
120
+ {
121
+ Preconditions.checkState(recordState == RecordState.NOT_END, "doesn't have enough columns"); // TODO exception class
122
+
123
+ // reset last state
124
+ wasQuotedColumn = false;
125
+ quotedValueLines.clear();
126
+
127
+ // local state
128
+ int valueStartPos = linePos;
129
+ int valueEndPos = 0; // initialized by VALUE state and used by LAST_TRIM_OR_VALUE and
130
+ StringBuilder quotedValue = null; // initial by VALUE or FIRST_TRIM state and used by QUOTED_VALUE state
131
+ ColumnState columnState = ColumnState.BEGIN;
132
+
133
+ while (true) {
134
+ final char c = nextChar();
135
+ if (TRACE) {
136
+ System.out.println("#MN c: " + c + " (" + columnState + "," + recordState + ")");
137
+ try { Thread.sleep(100); } catch (Exception e) {}
138
+ }
139
+
140
+ switch (columnState) {
141
+ case BEGIN:
142
+ // TODO optimization: state is BEGIN only at the first character of a column.
143
+ // this block can be out of the looop.
144
+ if (isDelimiter(c)) {
145
+ // empty value
146
+ return "";
147
+
148
+ } else if (isEndOfLine(c)) {
149
+ // empty value
150
+ recordState = RecordState.END;
151
+ return "";
152
+
153
+ } else if (isSpace(c) && trimIfNotQuoted) {
154
+ columnState = ColumnState.FIRST_TRIM;
155
+
156
+ } else if (isQuote(c)) {
157
+ valueStartPos = linePos; // == 1
158
+ wasQuotedColumn = true;
159
+ quotedValue = new StringBuilder();
160
+ columnState = ColumnState.QUOTED_VALUE;
161
+
162
+ } else {
163
+ columnState = ColumnState.VALUE;
164
+ }
165
+ break;
166
+
167
+ case FIRST_TRIM:
168
+ if (isDelimiter(c)) {
169
+ // empty value
170
+ return "";
171
+
172
+ } else if (isEndOfLine(c)) {
173
+ // empty value
174
+ recordState = RecordState.END;
175
+ return "";
176
+
177
+ } else if (isQuote(c)) {
178
+ // column has heading spaces and quoted. TODO should this be rejected?
179
+ valueStartPos = linePos;
180
+ wasQuotedColumn = true;
181
+ quotedValue = new StringBuilder();
182
+ columnState = ColumnState.QUOTED_VALUE;
183
+
184
+ } else if (isSpace(c)) {
185
+ // skip this character
186
+
187
+ } else {
188
+ valueStartPos = linePos - 1;
189
+ columnState = ColumnState.VALUE;
190
+ }
191
+ break;
192
+
193
+ case VALUE:
194
+ if (isDelimiter(c)) {
195
+ return line.substring(valueStartPos, linePos - 1);
196
+
197
+ } else if (isEndOfLine(c)) {
198
+ recordState = RecordState.END;
199
+ return line.substring(valueStartPos, linePos);
200
+
201
+ } else if (isSpace(c) && trimIfNotQuoted) {
202
+ valueEndPos = linePos - 1; // this is possibly end of value
203
+ columnState = ColumnState.LAST_TRIM_OR_VALUE;
204
+
205
+ // TODO not implemented yet foo""bar""baz -> [foo, bar, baz].append
206
+ //} else if (isQuote(c)) {
207
+ // // In RFC4180, If fields are not enclosed with double quotes, then
208
+ // // double quotes may not appear inside the fields. But they are often
209
+ // // included in the fields. We should care about them later.
210
+
211
+ } else {
212
+ // keep VALUE state
213
+ }
214
+ break;
215
+
216
+ case LAST_TRIM_OR_VALUE:
217
+ if (isDelimiter(c)) {
218
+ return line.substring(valueStartPos, valueEndPos);
219
+
220
+ } else if (isEndOfLine(c)) {
221
+ recordState = RecordState.END;
222
+ return line.substring(valueStartPos, valueEndPos);
223
+
224
+ } else if (isSpace(c)) {
225
+ // keep LAST_TRIM_OR_VALUE state
226
+
227
+ } else {
228
+ // this spaces are not trailing spaces. go back to VALUE state
229
+ columnState = ColumnState.BEGIN;
230
+ }
231
+ break;
232
+
233
+ case QUOTED_VALUE:
234
+ if (isEndOfLine(c)) {
235
+ // multi-line quoted value
236
+ quotedValue.append(line.substring(valueStartPos, linePos));
237
+ quotedValue.append(newline);
238
+ quotedValueLines.add(line);
239
+ if (!nextLine(false)) {
240
+ throw new RuntimeException("Unexpected end of line during parsing a quoted value"); // TODO exception class
241
+ }
242
+ valueStartPos = 0;
243
+
244
+ } else if (isQuote(c)) {
245
+ char next = peekNextChar();
246
+ if (TRACE) {
247
+ System.out.println("#MN peeked c: " + next + " (" + columnState + "," + recordState + ")");
248
+ }
249
+ if (isQuote(next)) { // escaped quote
250
+ quotedValue.append(line.substring(valueStartPos, linePos));
251
+ valueStartPos = ++linePos;
252
+ } else {
253
+ quotedValue.append(line.substring(valueStartPos, linePos - 1));
254
+ columnState = ColumnState.AFTER_QUOTED_VALUE;
255
+ }
256
+
257
+ } else if (isEscape(c)) { // isQuote must be checked first in case of quote == escape
258
+ // In RFC 4180, CSV's escape char is '\"'. But '\\' is often used.
259
+ char next = peekNextChar();
260
+ if (TRACE) {
261
+ System.out.println("#MN peeked c: " + next + " (" + columnState + "," + recordState + ")");
262
+ }
263
+ if (isEndOfLine(c)) {
264
+ // escape end of line. TODO assuming multi-line quoted value without newline?
265
+ quotedValue.append(line.substring(valueStartPos, linePos));
266
+ quotedValueLines.add(line);
267
+ if (!nextLine(false)) {
268
+ throw new RuntimeException("Unexpected end of line during parsing a quoted value"); // TODO exception class
269
+ }
270
+ valueStartPos = 0;
271
+ } else if (isQuote(next) || isEscape(next)) { // escaped quote
272
+ quotedValue.append(line.substring(valueStartPos, linePos - 1));
273
+ quotedValue.append(next);
274
+ valueStartPos = ++linePos;
275
+ }
276
+
277
+ } else {
278
+ // keep QUOTED_VALUE state
279
+ }
280
+ break;
281
+
282
+ case AFTER_QUOTED_VALUE:
283
+ if (isDelimiter(c)) {
284
+ return quotedValue.toString();
285
+
286
+ } else if (isEndOfLine(c)) {
287
+ recordState = RecordState.END;
288
+ return quotedValue.toString();
289
+
290
+ } else if (isSpace(c)) {
291
+ // column has trailing spaces and quoted. TODO should this be rejected?
292
+
293
+ } else {
294
+ throw new RuntimeException("Unexpected extra character after quoted value"); // TODO exception class
295
+ }
296
+ break;
297
+
298
+ default:
299
+ assert false;
300
+ }
301
+ }
302
+ }
303
+
304
+ public boolean wasQuotedColumn()
305
+ {
306
+ return wasQuotedColumn;
307
+ }
308
+
309
+ private char nextChar()
310
+ {
311
+ Preconditions.checkState(line != null, "nextColumn is called after end of file");
312
+
313
+ if (linePos >= line.length()) {
314
+ return END_OF_LINE;
315
+ } else {
316
+ return line.charAt(linePos++);
317
+ }
318
+ }
319
+
320
+ private char peekNextChar()
321
+ {
322
+ Preconditions.checkState(line != null, "peekNextChar is called after end of file");
323
+
324
+ if (linePos >= line.length()) {
325
+ return END_OF_LINE;
326
+ } else {
327
+ return line.charAt(linePos);
328
+ }
329
+ }
330
+
331
+ private boolean isSpace(char c)
332
+ {
333
+ return c == ' ';
334
+ }
335
+
336
+ private boolean isDelimiter(char c)
337
+ {
338
+ return c == delimiter;
339
+ }
340
+
341
+ private boolean isEndOfLine(char c)
342
+ {
343
+ return c == END_OF_LINE;
344
+ }
345
+
346
+ private boolean isQuote(char c)
347
+ {
348
+ return c == quote;
349
+ }
350
+
351
+ private boolean isEscape(char c)
352
+ {
353
+ return c == escape;
354
+ }
355
+ }
@@ -0,0 +1,55 @@
1
+ package org.embulk.standards;
2
+
3
+ import java.io.InputStream;
4
+ import java.io.IOException;
5
+ import java.util.zip.GZIPInputStream;
6
+ import com.fasterxml.jackson.annotation.JacksonInject;
7
+ import org.embulk.config.Task;
8
+ import org.embulk.config.TaskSource;
9
+ import org.embulk.config.ConfigSource;
10
+ import org.embulk.spi.DecoderPlugin;
11
+ import org.embulk.spi.BufferAllocator;
12
+ import org.embulk.spi.FileInput;
13
+ import org.embulk.spi.util.FileInputInputStream;
14
+ import org.embulk.spi.util.InputStreamFileInput;
15
+
16
+ public class GzipFileDecoderPlugin
17
+ implements DecoderPlugin
18
+ {
19
+ public interface PluginTask
20
+ extends Task
21
+ {
22
+ @JacksonInject
23
+ public BufferAllocator getBufferAllocator();
24
+ }
25
+
26
+ @Override
27
+ public void transaction(ConfigSource config, DecoderPlugin.Control control)
28
+ {
29
+ PluginTask task = config.loadConfig(PluginTask.class);
30
+ control.run(task.dump());
31
+ }
32
+
33
+ @Override
34
+ public FileInput open(TaskSource taskSource, FileInput input)
35
+ {
36
+ PluginTask task = taskSource.loadTask(PluginTask.class);
37
+ final FileInputInputStream files = new FileInputInputStream(input);
38
+ return new InputStreamFileInput(
39
+ task.getBufferAllocator(),
40
+ new InputStreamFileInput.Provider() {
41
+ public InputStream openNext() throws IOException
42
+ {
43
+ if (!files.nextFile()) {
44
+ return null;
45
+ }
46
+ return new GZIPInputStream(files);
47
+ }
48
+
49
+ public void close() throws IOException
50
+ {
51
+ files.close();
52
+ }
53
+ });
54
+ }
55
+ }
@@ -0,0 +1,39 @@
1
+ package org.embulk.standards;
2
+
3
+ import java.io.OutputStream;
4
+ import java.io.IOException;
5
+ import java.util.zip.GZIPOutputStream;
6
+ import org.embulk.config.Task;
7
+ import org.embulk.config.Config;
8
+ import org.embulk.config.ConfigDefault;
9
+ import org.embulk.config.TaskSource;
10
+ import org.embulk.config.ConfigSource;
11
+ import org.embulk.spi.EncoderPlugin;
12
+ import org.embulk.spi.FileOutput;
13
+ import org.embulk.spi.util.FileOutputOutputStream;
14
+
15
+ public class GzipFileEncoderPlugin
16
+ implements EncoderPlugin
17
+ {
18
+ public interface PluginTask
19
+ extends Task
20
+ {
21
+ @Config("level")
22
+ @ConfigDefault("6")
23
+ public int getLevel();
24
+ }
25
+
26
+ public void transaction(ConfigSource config, EncoderPlugin.Control control)
27
+ {
28
+ PluginTask task = config.loadConfig(PluginTask.class);
29
+ control.run(task.dump());
30
+ }
31
+
32
+ @Override
33
+ public FileOutput open(TaskSource taskSource, FileOutput fileOutput)
34
+ {
35
+ throw new AssertionError("OutputStreamFileOutput is not implemented yet");
36
+ // TODO GZIPOutputStream doesn't support level option?
37
+ //return new OutputStreamFileOutput(new GZIPOutputStream(new FileOutputOutputStream(fileOutput)));
38
+ }
39
+ }