embulk-parser-joni_regexp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.travis.yml +7 -0
  4. data/LICENSE.txt +21 -0
  5. data/README.md +107 -0
  6. data/build.gradle +101 -0
  7. data/config/checkstyle/checkstyle.xml +128 -0
  8. data/config/checkstyle/default.xml +108 -0
  9. data/example/apache.txt +5000 -0
  10. data/example/apache.yml +18 -0
  11. data/example/conf.yml +12 -0
  12. data/example/conf2.yml +12 -0
  13. data/example/conf3.yml +12 -0
  14. data/example/seed.yml +8 -0
  15. data/example/test.txt +8 -0
  16. data/example/test2.txt +3 -0
  17. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  18. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  19. data/gradlew +169 -0
  20. data/gradlew.bat +84 -0
  21. data/lib/embulk/guess/joni_regexp.rb +28 -0
  22. data/lib/embulk/parser/joni_regexp.rb +3 -0
  23. data/src/main/java/org/embulk/parser/joni_regexp/ColumnCaster.java +97 -0
  24. data/src/main/java/org/embulk/parser/joni_regexp/ColumnVisitorImpl.java +167 -0
  25. data/src/main/java/org/embulk/parser/joni_regexp/JoniRegexpParserPlugin.java +169 -0
  26. data/src/main/java/org/embulk/parser/joni_regexp/JsonRecordValidateException.java +22 -0
  27. data/src/main/java/org/embulk/parser/joni_regexp/cast/BooleanCast.java +39 -0
  28. data/src/main/java/org/embulk/parser/joni_regexp/cast/DoubleCast.java +41 -0
  29. data/src/main/java/org/embulk/parser/joni_regexp/cast/JsonCast.java +40 -0
  30. data/src/main/java/org/embulk/parser/joni_regexp/cast/LongCast.java +47 -0
  31. data/src/main/java/org/embulk/parser/joni_regexp/cast/StringCast.java +82 -0
  32. data/src/test/java/org/embulk/parser/joni_regexp/TestColumnCaster.java +256 -0
  33. data/src/test/java/org/embulk/parser/joni_regexp/TestJoniRegexpParserPlugin.java +432 -0
  34. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestBooleanCast.java +56 -0
  35. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestDoubleCast.java +49 -0
  36. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestJsonCast.java +79 -0
  37. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestLongCast.java +41 -0
  38. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestStringCast.java +103 -0
  39. metadata +112 -0
@@ -0,0 +1,47 @@
1
+ package org.embulk.parser.joni_regexp.cast;
2
+
3
+ import org.embulk.spi.DataException;
4
+ import org.embulk.spi.time.Timestamp;
5
+
6
+ public class LongCast
7
+ {
8
+ private LongCast() {}
9
+
10
+ private static String buildErrorMessage(String as, long value)
11
+ {
12
+ return String.format("cannot cast long to %s: \"%s\"", as, value);
13
+ }
14
+
15
+ public static boolean asBoolean(long value) throws DataException
16
+ {
17
+ if (value == 1) {
18
+ return true;
19
+ }
20
+ else if (value == 0) {
21
+ return false;
22
+ }
23
+ else {
24
+ throw new DataException(buildErrorMessage("boolean", value));
25
+ }
26
+ }
27
+
28
+ public static long asLong(long value) throws DataException
29
+ {
30
+ return value;
31
+ }
32
+
33
+ public static double asDouble(long value) throws DataException
34
+ {
35
+ return (double) value;
36
+ }
37
+
38
+ public static String asString(long value) throws DataException
39
+ {
40
+ return String.valueOf(value);
41
+ }
42
+
43
+ public static Timestamp asTimestamp(long value) throws DataException
44
+ {
45
+ return Timestamp.ofEpochSecond(value);
46
+ }
47
+ }
@@ -0,0 +1,82 @@
1
+ package org.embulk.parser.joni_regexp.cast;
2
+
3
+ import com.google.common.collect.ImmutableSet;
4
+ import org.embulk.spi.DataException;
5
+ import org.embulk.spi.time.Timestamp;
6
+ import org.embulk.spi.time.TimestampParseException;
7
+ import org.embulk.spi.time.TimestampParser;
8
+
9
+ public class StringCast
10
+ {
11
+ // copy from csv plugin
12
+ public static final ImmutableSet<String> TRUE_STRINGS =
13
+ ImmutableSet.of(
14
+ "true", "True", "TRUE",
15
+ "yes", "Yes", "YES",
16
+ "t", "T", "y", "Y",
17
+ "on", "On", "ON",
18
+ "1");
19
+
20
+ public static final ImmutableSet<String> FALSE_STRINGS =
21
+ ImmutableSet.of(
22
+ "false", "False", "FALSE",
23
+ "no", "No", "NO",
24
+ "f", "F", "n", "N",
25
+ "off", "Off", "OFF",
26
+ "0");
27
+
28
+ private StringCast() {}
29
+
30
+ private static String buildErrorMessage(String as, String value)
31
+ {
32
+ return String.format("cannot cast String to %s: \"%s\"", as, value);
33
+ }
34
+
35
+ public static boolean asBoolean(String value) throws DataException
36
+ {
37
+ if (TRUE_STRINGS.contains(value)) {
38
+ return true;
39
+ }
40
+ else if (FALSE_STRINGS.contains(value)) {
41
+ return false;
42
+ }
43
+ else {
44
+ throw new DataException(buildErrorMessage("boolean", value));
45
+ }
46
+ }
47
+
48
+ public static long asLong(String value) throws DataException
49
+ {
50
+ try {
51
+ return Long.parseLong(value);
52
+ }
53
+ catch (NumberFormatException ex) {
54
+ throw new DataException(buildErrorMessage("long", value), ex);
55
+ }
56
+ }
57
+
58
+ public static double asDouble(String value) throws DataException
59
+ {
60
+ try {
61
+ return Double.parseDouble(value);
62
+ }
63
+ catch (NumberFormatException ex) {
64
+ throw new DataException(buildErrorMessage("double", value), ex);
65
+ }
66
+ }
67
+
68
+ public static String asString(String value) throws DataException
69
+ {
70
+ return value;
71
+ }
72
+
73
+ public static Timestamp asTimestamp(String value, TimestampParser parser) throws DataException
74
+ {
75
+ try {
76
+ return parser.parse(value);
77
+ }
78
+ catch (TimestampParseException ex) {
79
+ throw new DataException(buildErrorMessage("timestamp", value), ex);
80
+ }
81
+ }
82
+ }
@@ -0,0 +1,256 @@
1
+ package org.embulk.parser.joni_regexp;
2
+
3
+ import org.embulk.EmbulkTestRuntime;
4
+ import org.embulk.spi.DataException;
5
+ import org.embulk.spi.time.Timestamp;
6
+ import org.embulk.spi.time.TimestampParser;
7
+ import org.joda.time.DateTimeZone;
8
+ import org.jruby.embed.ScriptingContainer;
9
+ import org.junit.Before;
10
+ import org.junit.Rule;
11
+ import org.junit.Test;
12
+ import org.msgpack.value.MapValue;
13
+ import org.msgpack.value.Value;
14
+ import org.msgpack.value.ValueFactory;
15
+
16
+ import static org.junit.Assert.assertEquals;
17
+ import static org.junit.Assert.assertTrue;
18
+ import static org.junit.Assert.fail;
19
+
20
+ public class TestColumnCaster
21
+ {
22
+ @Rule
23
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
24
+ public MapValue mapValue;
25
+ public DataException thrown;
26
+ public ScriptingContainer jruby;
27
+ public TimestampParser parser;
28
+
29
+ @Before
30
+ public void createResource()
31
+ {
32
+ jruby = new ScriptingContainer();
33
+ thrown = new DataException("any");
34
+ Value[] kvs = new Value[2];
35
+ kvs[0] = ValueFactory.newString("k");
36
+ kvs[1] = ValueFactory.newString("v");
37
+ mapValue = ValueFactory.newMap(kvs);
38
+ parser = new TimestampParser(jruby, "%Y-%m-%d %H:%M:%S.%N", DateTimeZone.UTC);
39
+ }
40
+
41
+ @Test
42
+ public void asBooleanFromBoolean()
43
+ {
44
+ assertEquals(true, ColumnCaster.asBoolean(ValueFactory.newBoolean(true)));
45
+ }
46
+
47
+ @Test
48
+ public void asBooleanFromInteger()
49
+ {
50
+ assertEquals(true, ColumnCaster.asBoolean(ValueFactory.newInteger(1)));
51
+ try {
52
+ ColumnCaster.asBoolean(ValueFactory.newInteger(2));
53
+ fail();
54
+ }
55
+ catch (Throwable t) {
56
+ assertTrue(t instanceof DataException);
57
+ }
58
+ }
59
+
60
+ @Test
61
+ public void asBooleanFromFloat()
62
+ {
63
+ try {
64
+ ColumnCaster.asBoolean(ValueFactory.newFloat(1.1));
65
+ fail();
66
+ }
67
+ catch (Throwable t) {
68
+ assertTrue(t instanceof DataException);
69
+ }
70
+ }
71
+
72
+ @Test
73
+ public void asBooleanFromString()
74
+ {
75
+ assertEquals(true, ColumnCaster.asBoolean(ValueFactory.newString("true")));
76
+ try {
77
+ ColumnCaster.asBoolean(ValueFactory.newString("foo"));
78
+ fail();
79
+ }
80
+ catch (Throwable t) {
81
+ assertTrue(t instanceof DataException);
82
+ }
83
+ }
84
+
85
+ @Test
86
+ public void asBooleanFromJson()
87
+ {
88
+ try {
89
+ ColumnCaster.asBoolean(mapValue);
90
+ fail();
91
+ }
92
+ catch (Throwable t) {
93
+ assertTrue(t instanceof DataException);
94
+ }
95
+ }
96
+
97
+ @Test
98
+ public void asLongFromBoolean()
99
+ {
100
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newBoolean(true)));
101
+ }
102
+
103
+ @Test
104
+ public void asLongFromInteger()
105
+ {
106
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newInteger(1)));
107
+ }
108
+
109
+ @Test
110
+ public void asLongFromFloat()
111
+ {
112
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newFloat(1.5)));
113
+ }
114
+
115
+ @Test
116
+ public void asLongFromString()
117
+ {
118
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newString("1")));
119
+ try {
120
+ ColumnCaster.asLong(ValueFactory.newString("foo"));
121
+ fail();
122
+ }
123
+ catch (Throwable t) {
124
+ assertTrue(t instanceof DataException);
125
+ }
126
+ }
127
+
128
+ @Test
129
+ public void asLongFromJson()
130
+ {
131
+ try {
132
+ ColumnCaster.asLong(mapValue);
133
+ fail();
134
+ }
135
+ catch (Throwable t) {
136
+ assertTrue(t instanceof DataException);
137
+ }
138
+ }
139
+
140
+ @Test
141
+ public void asDoubleFromBoolean()
142
+ {
143
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newBoolean(true)));
144
+ }
145
+
146
+ @Test
147
+ public void asDoubleFromInteger()
148
+ {
149
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newInteger(1)));
150
+ }
151
+
152
+ @Test
153
+ public void asDoubleFromFloat()
154
+ {
155
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newFloat(1.5)));
156
+ }
157
+
158
+ @Test
159
+ public void asDoubleFromString()
160
+ {
161
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newString("1")));
162
+ try {
163
+ ColumnCaster.asLong(ValueFactory.newString("foo"));
164
+ fail();
165
+ }
166
+ catch (Throwable t) {
167
+ assertTrue(t instanceof DataException);
168
+ }
169
+ }
170
+
171
+ @Test
172
+ public void asDoubleFromJson()
173
+ {
174
+ try {
175
+ ColumnCaster.asLong(mapValue);
176
+ fail();
177
+ }
178
+ catch (Throwable t) {
179
+ assertTrue(t instanceof DataException);
180
+ }
181
+ }
182
+
183
+ @Test
184
+ public void asStringFromBoolean()
185
+ {
186
+ assertEquals("true", ColumnCaster.asString(ValueFactory.newBoolean(true)));
187
+ }
188
+
189
+ @Test
190
+ public void asStringFromInteger()
191
+ {
192
+ assertEquals("1", ColumnCaster.asString(ValueFactory.newInteger(1)));
193
+ }
194
+
195
+ @Test
196
+ public void asStringFromFloat()
197
+ {
198
+ assertEquals("1.5", ColumnCaster.asString(ValueFactory.newFloat(1.5)));
199
+ }
200
+
201
+ @Test
202
+ public void asStringFromString()
203
+ {
204
+ assertEquals("1", ColumnCaster.asString(ValueFactory.newString("1")));
205
+ }
206
+
207
+ @Test
208
+ public void asStringFromJson()
209
+ {
210
+ assertEquals("{\"k\":\"v\"}", ColumnCaster.asString(mapValue));
211
+ }
212
+
213
+ @Test
214
+ public void asTimestampFromBoolean()
215
+ {
216
+ try {
217
+ ColumnCaster.asTimestamp(ValueFactory.newBoolean(true), parser);
218
+ fail();
219
+ }
220
+ catch (Throwable t) {
221
+ assertTrue(t instanceof DataException);
222
+ }
223
+ }
224
+
225
+ @Test
226
+ public void asTimestampFromInteger()
227
+ {
228
+ assertEquals(1, ColumnCaster.asTimestamp(ValueFactory.newInteger(1), parser).getEpochSecond());
229
+ }
230
+
231
+ @Test
232
+ public void asTimestampFromFloat()
233
+ {
234
+ Timestamp expected = Timestamp.ofEpochSecond(1463084053, 500000000);
235
+ assertEquals(expected, ColumnCaster.asTimestamp(ValueFactory.newFloat(1463084053.5), parser));
236
+ }
237
+
238
+ @Test
239
+ public void asTimestampFromString()
240
+ {
241
+ Timestamp expected = Timestamp.ofEpochSecond(1463084053, 500000000);
242
+ assertEquals(expected, ColumnCaster.asTimestamp(ValueFactory.newString("2016-05-12 20:14:13.5"), parser));
243
+ }
244
+
245
+ @Test
246
+ public void asTimestampFromJson()
247
+ {
248
+ try {
249
+ ColumnCaster.asTimestamp(mapValue, parser);
250
+ fail();
251
+ }
252
+ catch (Throwable t) {
253
+ assertTrue(t instanceof DataException);
254
+ }
255
+ }
256
+ }
@@ -0,0 +1,432 @@
1
+ package org.embulk.parser.joni_regexp;
2
+
3
+ import com.google.common.collect.ImmutableList;
4
+ import com.google.common.collect.ImmutableMap;
5
+ import com.google.common.collect.Lists;
6
+ import org.embulk.EmbulkTestRuntime;
7
+ import org.embulk.config.ConfigException;
8
+ import org.embulk.config.ConfigSource;
9
+ import org.embulk.config.TaskSource;
10
+ import org.embulk.spi.ColumnConfig;
11
+ import org.embulk.spi.DataException;
12
+ import org.embulk.spi.Exec;
13
+ import org.embulk.spi.FileInput;
14
+ import org.embulk.spi.ParserPlugin;
15
+ import org.embulk.spi.Schema;
16
+ import org.embulk.spi.SchemaConfig;
17
+ import org.embulk.spi.SchemaConfigException;
18
+ import org.embulk.spi.TestPageBuilderReader;
19
+ import org.embulk.spi.time.Timestamp;
20
+ import org.embulk.spi.type.Type;
21
+ import org.embulk.spi.util.InputStreamFileInput;
22
+ import org.embulk.spi.util.Newline;
23
+ import org.embulk.spi.util.Pages;
24
+ import org.joda.time.DateTimeZone;
25
+ import org.junit.Before;
26
+ import org.junit.Rule;
27
+ import org.junit.Test;
28
+ import org.msgpack.value.Value;
29
+
30
+ import java.io.ByteArrayInputStream;
31
+ import java.io.IOException;
32
+ import java.io.InputStream;
33
+ import java.nio.charset.Charset;
34
+ import java.nio.charset.StandardCharsets;
35
+ import java.util.List;
36
+
37
+ import static junit.framework.TestCase.assertEquals;
38
+ import static org.embulk.spi.type.Types.BOOLEAN;
39
+ import static org.embulk.spi.type.Types.DOUBLE;
40
+ import static org.embulk.spi.type.Types.JSON;
41
+ import static org.embulk.spi.type.Types.LONG;
42
+ import static org.embulk.spi.type.Types.STRING;
43
+ import static org.embulk.spi.type.Types.TIMESTAMP;
44
+ import static org.junit.Assert.assertTrue;
45
+ import static org.msgpack.value.ValueFactory.newArray;
46
+ import static org.msgpack.value.ValueFactory.newFloat;
47
+ import static org.msgpack.value.ValueFactory.newInteger;
48
+ import static org.msgpack.value.ValueFactory.newMap;
49
+ import static org.msgpack.value.ValueFactory.newString;
50
+
51
+ public class TestJoniRegexpParserPlugin
52
+ {
53
+ private ConfigSource config;
54
+ private JoniRegexpParserPlugin plugin;
55
+ private TestPageBuilderReader.MockPageOutput output;
56
+
57
+ // TODO
58
+ // private static final String RESOURCE_NAME_PREFIX = "org/embulk/parser/joni/";
59
+
60
+ @Before
61
+ public void createResource()
62
+ {
63
+ config = config().set("type", "joni_regexp");
64
+ plugin = new JoniRegexpParserPlugin();
65
+ recreatePageOutput();
66
+ }
67
+
68
+ @Rule
69
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
70
+
71
+ // TODO use TestingEmbulk
72
+ // @Rule
73
+ // public TestingEmbulk embulk = TestingEmbulk.builder().build();
74
+
75
+ @Test
76
+ public void basicApacheCombinedLogTest()
77
+ throws Exception
78
+ {
79
+ // TODO Use TestingEmbulk
80
+ // ConfigSource config = embulk.loadYamlResource(RESOURCE_NAME_PREFIX+"apache.yml");
81
+
82
+ SchemaConfig schema = schema(
83
+ column("host", STRING), column("user", STRING),
84
+ column("time", TIMESTAMP, config().set("format", "%d/%b/%Y:%H:%M:%S %z")),
85
+ column("method", STRING), column("path", STRING),
86
+ column("code", STRING), column("size", STRING), column("referer", STRING),
87
+ column("agent", STRING));
88
+ ConfigSource config = this.config.deepCopy().set("columns", schema)
89
+ .set("format", "^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \\[(?<time>[^\\]]*)\\] \"(?<method>\\S+)(?: +(?<path>[^ ]*) +\\S*)?\" (?<code>[^ ]*) (?<size>[^ ]*)(?: \"(?<referer>[^\\\"]*)\" \"(?<agent>[^\\\"]*)\")?$");
90
+
91
+ // config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
92
+
93
+ transaction(config, fileInput(
94
+ "224.126.227.109 - - [13/Feb/2017:20:04:52 +0900] \"GET /category/games HTTP/1.1\" 200 85 \"-\" \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11\"",
95
+ "128.27.132.24 - bob [13/Feb/2017:20:04:53 +0900] \"GET /category/health HTTP/1.1\" 200 103 \"/category/electronics?from=20\" \"Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1\""
96
+ ));
97
+
98
+ List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
99
+ assertEquals(2, records.size());
100
+
101
+ Object[] record;
102
+ {
103
+ record = records.get(0);
104
+ assertEquals("224.126.227.109", record[0]);
105
+ assertEquals("-", record[1]);
106
+ assertEquals(Timestamp.ofEpochSecond(1486983892L), record[2]);
107
+ assertEquals("GET", record[3]);
108
+ assertEquals("/category/games", record[4]);
109
+ // assertEquals("HTTP/1.1", record[5]);
110
+ assertEquals("200", record[5]);
111
+ assertEquals("85", record[6]);
112
+ assertEquals("-", record[7]);
113
+ assertEquals("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", record[8]);
114
+ }
115
+ {
116
+ record = records.get(1);
117
+ assertEquals("128.27.132.24", record[0]);
118
+ assertEquals("bob", record[1]);
119
+ assertEquals(Timestamp.ofEpochSecond(1486983893L), record[2]);
120
+ assertEquals("GET", record[3]);
121
+ assertEquals("/category/health", record[4]);
122
+ // assertEquals("HTTP/1.1", record[5]);
123
+ assertEquals("200", record[5]);
124
+ assertEquals("103", record[6]);
125
+ assertEquals("/category/electronics?from=20", record[7]);
126
+ assertEquals("Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1", record[8]);
127
+ }
128
+ }
129
+
130
+ @Test
131
+ public void basicApacheCombinedLogTestEnableStopOnInvalidRecord()
132
+ throws Exception
133
+ {
134
+ // TODO Use TestingEmbulk
135
+ // ConfigSource config = embulk.loadYamlResource(RESOURCE_NAME_PREFIX+"apache.yml");
136
+
137
+ SchemaConfig schema = schema(
138
+ column("host", STRING), column("user", STRING),
139
+ column("time", TIMESTAMP, config().set("format", "%d/%b/%Y:%H:%M:%S %z")),
140
+ column("method", STRING), column("path", STRING),
141
+ column("code", STRING), column("size", STRING), column("referer", STRING),
142
+ column("agent", STRING));
143
+ ConfigSource config = this.config.deepCopy().set("columns", schema)
144
+ .set("format", "^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \\[(?<time>[^\\]]*)\\] \"(?<method>\\S+)(?: +(?<path>[^ ]*) +\\S*)?\" (?<code>[^ ]*) (?<size>[^ ]*)(?: \"(?<referer>[^\\\"]*)\" \"(?<agent>[^\\\"]*)\")?$")
145
+ .set("stop_on_invalid_record", false);
146
+
147
+ // config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
148
+
149
+ transaction(config, fileInput(
150
+ "224.126.227.109 - - [13/Feb/2017:20:04:52 +0900] \"GET /category/games HTTP/1.1\" 200 85 \"-\" \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11\"",
151
+ "invalid_record1",
152
+ "128.27.132.24 - bob [13/Feb/2017:20:04:53 +0900] \"GET /category/health HTTP/1.1\" 200 103 \"/category/electronics?from=20\" \"Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1\"",
153
+ "invalid_record2"
154
+ ));
155
+
156
+ List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
157
+ assertEquals(2, records.size());
158
+
159
+ Object[] record;
160
+ {
161
+ record = records.get(0);
162
+ assertEquals("224.126.227.109", record[0]);
163
+ assertEquals("-", record[1]);
164
+ assertEquals(Timestamp.ofEpochSecond(1486983892L), record[2]);
165
+ assertEquals("GET", record[3]);
166
+ assertEquals("/category/games", record[4]);
167
+ // assertEquals("HTTP/1.1", record[5]);
168
+ assertEquals("200", record[5]);
169
+ assertEquals("85", record[6]);
170
+ assertEquals("-", record[7]);
171
+ assertEquals("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", record[8]);
172
+ }
173
+ {
174
+ record = records.get(1);
175
+ assertEquals("128.27.132.24", record[0]);
176
+ assertEquals("bob", record[1]);
177
+ assertEquals(Timestamp.ofEpochSecond(1486983893L), record[2]);
178
+ assertEquals("GET", record[3]);
179
+ assertEquals("/category/health", record[4]);
180
+ assertEquals("200", record[5]);
181
+ assertEquals("103", record[6]);
182
+ assertEquals("/category/electronics?from=20", record[7]);
183
+ assertEquals("Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1", record[8]);
184
+ }
185
+ }
186
+
187
+ @Test(expected = DataException.class)
188
+ public void checkInvalidRecord()
189
+ throws Exception
190
+ {
191
+ SchemaConfig schema = schema(
192
+ column("host", STRING), column("user", STRING),
193
+ column("time", TIMESTAMP, config().set("format", "%d/%b/%Y:%H:%M:%S %z")),
194
+ column("method", STRING), column("path", STRING),
195
+ column("code", STRING), column("size", STRING), column("referer", STRING),
196
+ column("agent", STRING));
197
+ ConfigSource config = this.config.deepCopy().set("columns", schema)
198
+ .set("format", "^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \\[(?<time>[^\\]]*)\\] \"(?<method>\\S+)(?: +(?<path>[^ ]*) +\\S*)?\" (?<code>[^ ]*) (?<size>[^ ]*)(?: \"(?<referer>[^\\\"]*)\" \"(?<agent>[^\\\"]*)\")?$")
199
+ .set("stop_on_invalid_record", true);
200
+
201
+ transaction(config, fileInput(
202
+ "224.126.227.109 - - [13/Feb/2017:20:04:52 +0900] \"GET /category/games HTTP/1.1\" 200 85 \"-\" \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11\"",
203
+ "invalid_record1",
204
+ "128.27.132.24 - bob [13/Feb/2017:20:04:53 +0900] \"GET /category/health HTTP/1.1\" 200 103 \"/category/electronics?from=20\" \"Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1\"",
205
+ "invalid_record2"
206
+ ));
207
+ }
208
+
209
+ @Test
210
+ public void checkLookahead()
211
+ throws Exception
212
+ {
213
+
214
+ SchemaConfig schema = schema(
215
+ column("string1", STRING), column("string2", STRING));
216
+
217
+ ConfigSource config = this.config.deepCopy().set("columns", schema)
218
+ .set("format", "^\"(?<string1>.*?)\"(?<!\\.) \"(?<string2>.*?)\"");
219
+
220
+ transaction(config, fileInput(
221
+ "\"This is a \\\"test\\\".\" \"This is a \\\"test\\\".\"]}"));
222
+
223
+ List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
224
+ assertEquals(1, records.size());
225
+
226
+ Object[] record;
227
+ {
228
+ record = records.get(0);
229
+ assertEquals("This is a \\\"test\\\".", record[0]);
230
+ assertEquals("This is a \\", record[1]);
231
+ }
232
+ }
233
+
234
+ @Test(expected = DataException.class)
235
+ public void checkInvalidFormat()
236
+ throws Exception
237
+ {
238
+
239
+ SchemaConfig schema = schema(
240
+ column("date", TIMESTAMP, config().set("format", "%H:%M:%S")));
241
+
242
+ ConfigSource config = this.config.deepCopy().set("columns", schema)
243
+ .set("format", "(?<date>\\S+)");
244
+
245
+ transaction(config, fileInput(
246
+ "2017-02-19 This is a test."));
247
+ }
248
+
249
+ @Test
250
+ public void checkAllColumnTypes()
251
+ throws Exception
252
+ {
253
+
254
+ SchemaConfig schema = schema(
255
+ column("bool", BOOLEAN), column("string", STRING),
256
+ column("time", TIMESTAMP, config().set("format", "%Y-%m-%d %H:%M:%S")),
257
+ column("long", LONG), column("double", DOUBLE),
258
+ column("json", JSON));
259
+
260
+ ConfigSource config = this.config.deepCopy().set("columns", schema)
261
+ .set("format", "^(?<bool>[^\t]*)\t(?<string>[^\t]*)\t(?<time>[^\t]*)\t*(?<long>[^\t]*)\t(?<double>[^\t]*)\t(?<json>[^\t]*)$");
262
+
263
+ transaction(config, fileInput(
264
+ "true\tマイケル・ジャクソン\t2009-6-25 00:00:00\t456789\t123.456\t{\"name\":\"Michael Jackson\",\"birth\":\"1958-8-29\",\"age\":50,\"Bad World Tour\":4.4,\"album\":[\"Got To Be There\",\"Ben\",\"Music & Me\"]}"));
265
+
266
+ Value json = newMap(newString("name"), newString("Michael Jackson"),
267
+ newString("birth"), newString("1958-8-29"),
268
+ newString("age"), newInteger(50),
269
+ newString("Bad World Tour"), newFloat(4.4),
270
+ newString("album"), newArray(newString("Got To Be There"), newString("Ben"), newString("Music & Me")));
271
+ List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
272
+ assertEquals(1, records.size());
273
+
274
+ Object[] record;
275
+ {
276
+ record = records.get(0);
277
+ assertEquals(true, record[0]);
278
+ assertEquals("マイケル・ジャクソン", record[1]);
279
+ assertEquals(Timestamp.ofEpochSecond(1245888000L), record[2]);
280
+ assertEquals(456789L, record[3]);
281
+ assertEquals(123.456, record[4]);
282
+ assertEquals(json, record[5]);
283
+ }
284
+ }
285
+
286
+ @Test
287
+ public void checkDefaultValues()
288
+ {
289
+ ConfigSource config = Exec.newConfigSource()
290
+ .set("columns", ImmutableList.of(
291
+ ImmutableMap.of(
292
+ "name", "name",
293
+ "type", "string")))
294
+ .set("format", "(?<name>a*)");
295
+
296
+ JoniRegexpParserPlugin.PluginTask task = config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
297
+
298
+ assertEquals(Charset.forName("utf-8"), task.getCharset());
299
+ assertEquals(Newline.CRLF, task.getNewline());
300
+ assertEquals(DateTimeZone.UTC, task.getDefaultTimeZone());
301
+ assertEquals("%Y-%m-%d %H:%M:%S.%N %z", task.getDefaultTimestampFormat());
302
+ assertEquals(false, task.getStopOnInvalidRecord());
303
+ //assertEquals( true, task.getDefaultTypecast());
304
+
305
+ }
306
+
307
+ @Test(expected = ConfigException.class)
308
+ public void checkColumnsRequired()
309
+ {
310
+ ConfigSource config = Exec.newConfigSource()
311
+ .set("format", "(?<name>a*)");
312
+
313
+ config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
314
+ }
315
+
316
+ @Test(expected = ConfigException.class)
317
+ public void checkFormatRequired()
318
+ {
319
+ ConfigSource config = Exec.newConfigSource()
320
+ .set("columns", ImmutableList.of(
321
+ ImmutableMap.of(
322
+ "name", "name",
323
+ "type", "string")));
324
+
325
+ config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
326
+ }
327
+
328
+ @Test // (expected = SchemaConfigException.class)
329
+ public void checkNamedCaptureColumnNotFound()
330
+ {
331
+ ConfigSource config2 = Exec.newConfigSource()
332
+ .set("columns", ImmutableList.of(
333
+ ImmutableMap.of(
334
+ "name", "hoge",
335
+ "type", "string")))
336
+ .set("format", "(?<no_capture_name>a*)");
337
+
338
+ config2.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
339
+ try {
340
+ transaction(config2, fileInput(""));
341
+ }
342
+ catch (Throwable t) {
343
+ assertTrue(t instanceof SchemaConfigException);
344
+ }
345
+ }
346
+
347
+ @Test(expected = org.embulk.config.ConfigException.class)
348
+ public void checkNoNamedCapturingGroupRegex()
349
+ throws Exception
350
+ {
351
+ ConfigSource config2 = Exec.newConfigSource()
352
+ .set("columns", ImmutableList.of(
353
+ ImmutableMap.of(
354
+ "name", "hoge",
355
+ "type", "string")))
356
+ .set("format", "no named capturing group regex");
357
+
358
+ config2.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
359
+ transaction(config2, fileInput(""));
360
+ }
361
+
362
+ @Test(expected = org.joni.exception.SyntaxException.class)
363
+ public void checkInvalidRegexSyntax()
364
+ throws Exception
365
+ {
366
+ ConfigSource config2 = Exec.newConfigSource()
367
+ .set("columns", ImmutableList.of(
368
+ ImmutableMap.of(
369
+ "name", "hoge",
370
+ "type", "string")))
371
+ .set("format", "(?<invalid_regex");
372
+
373
+ config2.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
374
+ transaction(config2, fileInput(""));
375
+ }
376
+
377
+ private void transaction(ConfigSource config, final FileInput input)
378
+ {
379
+ plugin.transaction(config, new ParserPlugin.Control()
380
+ {
381
+ @Override
382
+ public void run(TaskSource taskSource, Schema schema)
383
+ {
384
+ plugin.run(taskSource, schema, input, output);
385
+ }
386
+ });
387
+ }
388
+
389
+ private void recreatePageOutput()
390
+ {
391
+ output = new TestPageBuilderReader.MockPageOutput();
392
+ }
393
+
394
+ private FileInput fileInput(String... lines)
395
+ throws Exception
396
+ {
397
+ StringBuilder sb = new StringBuilder();
398
+ for (String line : lines) {
399
+ sb.append(line).append("\n");
400
+ }
401
+
402
+ ByteArrayInputStream in = new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8));
403
+ return new InputStreamFileInput(runtime.getBufferAllocator(), provider(in));
404
+ }
405
+
406
+ private InputStreamFileInput.IteratorProvider provider(InputStream... inputStreams)
407
+ throws IOException
408
+ {
409
+ return new InputStreamFileInput.IteratorProvider(
410
+ ImmutableList.copyOf(inputStreams));
411
+ }
412
+
413
+ private ConfigSource config()
414
+ {
415
+ return runtime.getExec().newConfigSource();
416
+ }
417
+
418
+ private SchemaConfig schema(ColumnConfig... columns)
419
+ {
420
+ return new SchemaConfig(Lists.newArrayList(columns));
421
+ }
422
+
423
+ private ColumnConfig column(String name, Type type)
424
+ {
425
+ return column(name, type, config());
426
+ }
427
+
428
+ private ColumnConfig column(String name, Type type, ConfigSource option)
429
+ {
430
+ return new ColumnConfig(name, type, option);
431
+ }
432
+ }