embulk-parser-joni_regexp 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.travis.yml +7 -0
  4. data/LICENSE.txt +21 -0
  5. data/README.md +107 -0
  6. data/build.gradle +101 -0
  7. data/config/checkstyle/checkstyle.xml +128 -0
  8. data/config/checkstyle/default.xml +108 -0
  9. data/example/apache.txt +5000 -0
  10. data/example/apache.yml +18 -0
  11. data/example/conf.yml +12 -0
  12. data/example/conf2.yml +12 -0
  13. data/example/conf3.yml +12 -0
  14. data/example/seed.yml +8 -0
  15. data/example/test.txt +8 -0
  16. data/example/test2.txt +3 -0
  17. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  18. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  19. data/gradlew +169 -0
  20. data/gradlew.bat +84 -0
  21. data/lib/embulk/guess/joni_regexp.rb +28 -0
  22. data/lib/embulk/parser/joni_regexp.rb +3 -0
  23. data/src/main/java/org/embulk/parser/joni_regexp/ColumnCaster.java +97 -0
  24. data/src/main/java/org/embulk/parser/joni_regexp/ColumnVisitorImpl.java +167 -0
  25. data/src/main/java/org/embulk/parser/joni_regexp/JoniRegexpParserPlugin.java +169 -0
  26. data/src/main/java/org/embulk/parser/joni_regexp/JsonRecordValidateException.java +22 -0
  27. data/src/main/java/org/embulk/parser/joni_regexp/cast/BooleanCast.java +39 -0
  28. data/src/main/java/org/embulk/parser/joni_regexp/cast/DoubleCast.java +41 -0
  29. data/src/main/java/org/embulk/parser/joni_regexp/cast/JsonCast.java +40 -0
  30. data/src/main/java/org/embulk/parser/joni_regexp/cast/LongCast.java +47 -0
  31. data/src/main/java/org/embulk/parser/joni_regexp/cast/StringCast.java +82 -0
  32. data/src/test/java/org/embulk/parser/joni_regexp/TestColumnCaster.java +256 -0
  33. data/src/test/java/org/embulk/parser/joni_regexp/TestJoniRegexpParserPlugin.java +432 -0
  34. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestBooleanCast.java +56 -0
  35. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestDoubleCast.java +49 -0
  36. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestJsonCast.java +79 -0
  37. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestLongCast.java +41 -0
  38. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestStringCast.java +103 -0
  39. metadata +112 -0
@@ -0,0 +1,47 @@
1
+ package org.embulk.parser.joni_regexp.cast;
2
+
3
+ import org.embulk.spi.DataException;
4
+ import org.embulk.spi.time.Timestamp;
5
+
6
+ public class LongCast
7
+ {
8
+ private LongCast() {}
9
+
10
+ private static String buildErrorMessage(String as, long value)
11
+ {
12
+ return String.format("cannot cast long to %s: \"%s\"", as, value);
13
+ }
14
+
15
+ public static boolean asBoolean(long value) throws DataException
16
+ {
17
+ if (value == 1) {
18
+ return true;
19
+ }
20
+ else if (value == 0) {
21
+ return false;
22
+ }
23
+ else {
24
+ throw new DataException(buildErrorMessage("boolean", value));
25
+ }
26
+ }
27
+
28
+ public static long asLong(long value) throws DataException
29
+ {
30
+ return value;
31
+ }
32
+
33
+ public static double asDouble(long value) throws DataException
34
+ {
35
+ return (double) value;
36
+ }
37
+
38
+ public static String asString(long value) throws DataException
39
+ {
40
+ return String.valueOf(value);
41
+ }
42
+
43
+ public static Timestamp asTimestamp(long value) throws DataException
44
+ {
45
+ return Timestamp.ofEpochSecond(value);
46
+ }
47
+ }
@@ -0,0 +1,82 @@
1
+ package org.embulk.parser.joni_regexp.cast;
2
+
3
+ import com.google.common.collect.ImmutableSet;
4
+ import org.embulk.spi.DataException;
5
+ import org.embulk.spi.time.Timestamp;
6
+ import org.embulk.spi.time.TimestampParseException;
7
+ import org.embulk.spi.time.TimestampParser;
8
+
9
+ public class StringCast
10
+ {
11
+ // copy from csv plugin
12
+ public static final ImmutableSet<String> TRUE_STRINGS =
13
+ ImmutableSet.of(
14
+ "true", "True", "TRUE",
15
+ "yes", "Yes", "YES",
16
+ "t", "T", "y", "Y",
17
+ "on", "On", "ON",
18
+ "1");
19
+
20
+ public static final ImmutableSet<String> FALSE_STRINGS =
21
+ ImmutableSet.of(
22
+ "false", "False", "FALSE",
23
+ "no", "No", "NO",
24
+ "f", "F", "n", "N",
25
+ "off", "Off", "OFF",
26
+ "0");
27
+
28
+ private StringCast() {}
29
+
30
+ private static String buildErrorMessage(String as, String value)
31
+ {
32
+ return String.format("cannot cast String to %s: \"%s\"", as, value);
33
+ }
34
+
35
+ public static boolean asBoolean(String value) throws DataException
36
+ {
37
+ if (TRUE_STRINGS.contains(value)) {
38
+ return true;
39
+ }
40
+ else if (FALSE_STRINGS.contains(value)) {
41
+ return false;
42
+ }
43
+ else {
44
+ throw new DataException(buildErrorMessage("boolean", value));
45
+ }
46
+ }
47
+
48
+ public static long asLong(String value) throws DataException
49
+ {
50
+ try {
51
+ return Long.parseLong(value);
52
+ }
53
+ catch (NumberFormatException ex) {
54
+ throw new DataException(buildErrorMessage("long", value), ex);
55
+ }
56
+ }
57
+
58
+ public static double asDouble(String value) throws DataException
59
+ {
60
+ try {
61
+ return Double.parseDouble(value);
62
+ }
63
+ catch (NumberFormatException ex) {
64
+ throw new DataException(buildErrorMessage("double", value), ex);
65
+ }
66
+ }
67
+
68
+ public static String asString(String value) throws DataException
69
+ {
70
+ return value;
71
+ }
72
+
73
+ public static Timestamp asTimestamp(String value, TimestampParser parser) throws DataException
74
+ {
75
+ try {
76
+ return parser.parse(value);
77
+ }
78
+ catch (TimestampParseException ex) {
79
+ throw new DataException(buildErrorMessage("timestamp", value), ex);
80
+ }
81
+ }
82
+ }
@@ -0,0 +1,256 @@
1
+ package org.embulk.parser.joni_regexp;
2
+
3
+ import org.embulk.EmbulkTestRuntime;
4
+ import org.embulk.spi.DataException;
5
+ import org.embulk.spi.time.Timestamp;
6
+ import org.embulk.spi.time.TimestampParser;
7
+ import org.joda.time.DateTimeZone;
8
+ import org.jruby.embed.ScriptingContainer;
9
+ import org.junit.Before;
10
+ import org.junit.Rule;
11
+ import org.junit.Test;
12
+ import org.msgpack.value.MapValue;
13
+ import org.msgpack.value.Value;
14
+ import org.msgpack.value.ValueFactory;
15
+
16
+ import static org.junit.Assert.assertEquals;
17
+ import static org.junit.Assert.assertTrue;
18
+ import static org.junit.Assert.fail;
19
+
20
+ public class TestColumnCaster
21
+ {
22
+ @Rule
23
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
24
+ public MapValue mapValue;
25
+ public DataException thrown;
26
+ public ScriptingContainer jruby;
27
+ public TimestampParser parser;
28
+
29
+ @Before
30
+ public void createResource()
31
+ {
32
+ jruby = new ScriptingContainer();
33
+ thrown = new DataException("any");
34
+ Value[] kvs = new Value[2];
35
+ kvs[0] = ValueFactory.newString("k");
36
+ kvs[1] = ValueFactory.newString("v");
37
+ mapValue = ValueFactory.newMap(kvs);
38
+ parser = new TimestampParser(jruby, "%Y-%m-%d %H:%M:%S.%N", DateTimeZone.UTC);
39
+ }
40
+
41
+ @Test
42
+ public void asBooleanFromBoolean()
43
+ {
44
+ assertEquals(true, ColumnCaster.asBoolean(ValueFactory.newBoolean(true)));
45
+ }
46
+
47
+ @Test
48
+ public void asBooleanFromInteger()
49
+ {
50
+ assertEquals(true, ColumnCaster.asBoolean(ValueFactory.newInteger(1)));
51
+ try {
52
+ ColumnCaster.asBoolean(ValueFactory.newInteger(2));
53
+ fail();
54
+ }
55
+ catch (Throwable t) {
56
+ assertTrue(t instanceof DataException);
57
+ }
58
+ }
59
+
60
+ @Test
61
+ public void asBooleanFromFloat()
62
+ {
63
+ try {
64
+ ColumnCaster.asBoolean(ValueFactory.newFloat(1.1));
65
+ fail();
66
+ }
67
+ catch (Throwable t) {
68
+ assertTrue(t instanceof DataException);
69
+ }
70
+ }
71
+
72
+ @Test
73
+ public void asBooleanFromString()
74
+ {
75
+ assertEquals(true, ColumnCaster.asBoolean(ValueFactory.newString("true")));
76
+ try {
77
+ ColumnCaster.asBoolean(ValueFactory.newString("foo"));
78
+ fail();
79
+ }
80
+ catch (Throwable t) {
81
+ assertTrue(t instanceof DataException);
82
+ }
83
+ }
84
+
85
+ @Test
86
+ public void asBooleanFromJson()
87
+ {
88
+ try {
89
+ ColumnCaster.asBoolean(mapValue);
90
+ fail();
91
+ }
92
+ catch (Throwable t) {
93
+ assertTrue(t instanceof DataException);
94
+ }
95
+ }
96
+
97
+ @Test
98
+ public void asLongFromBoolean()
99
+ {
100
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newBoolean(true)));
101
+ }
102
+
103
+ @Test
104
+ public void asLongFromInteger()
105
+ {
106
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newInteger(1)));
107
+ }
108
+
109
+ @Test
110
+ public void asLongFromFloat()
111
+ {
112
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newFloat(1.5)));
113
+ }
114
+
115
+ @Test
116
+ public void asLongFromString()
117
+ {
118
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newString("1")));
119
+ try {
120
+ ColumnCaster.asLong(ValueFactory.newString("foo"));
121
+ fail();
122
+ }
123
+ catch (Throwable t) {
124
+ assertTrue(t instanceof DataException);
125
+ }
126
+ }
127
+
128
+ @Test
129
+ public void asLongFromJson()
130
+ {
131
+ try {
132
+ ColumnCaster.asLong(mapValue);
133
+ fail();
134
+ }
135
+ catch (Throwable t) {
136
+ assertTrue(t instanceof DataException);
137
+ }
138
+ }
139
+
140
+ @Test
141
+ public void asDoubleFromBoolean()
142
+ {
143
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newBoolean(true)));
144
+ }
145
+
146
+ @Test
147
+ public void asDoubleFromInteger()
148
+ {
149
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newInteger(1)));
150
+ }
151
+
152
+ @Test
153
+ public void asDoubleFromFloat()
154
+ {
155
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newFloat(1.5)));
156
+ }
157
+
158
+ @Test
159
+ public void asDoubleFromString()
160
+ {
161
+ assertEquals(1, ColumnCaster.asLong(ValueFactory.newString("1")));
162
+ try {
163
+ ColumnCaster.asLong(ValueFactory.newString("foo"));
164
+ fail();
165
+ }
166
+ catch (Throwable t) {
167
+ assertTrue(t instanceof DataException);
168
+ }
169
+ }
170
+
171
+ @Test
172
+ public void asDoubleFromJson()
173
+ {
174
+ try {
175
+ ColumnCaster.asLong(mapValue);
176
+ fail();
177
+ }
178
+ catch (Throwable t) {
179
+ assertTrue(t instanceof DataException);
180
+ }
181
+ }
182
+
183
+ @Test
184
+ public void asStringFromBoolean()
185
+ {
186
+ assertEquals("true", ColumnCaster.asString(ValueFactory.newBoolean(true)));
187
+ }
188
+
189
+ @Test
190
+ public void asStringFromInteger()
191
+ {
192
+ assertEquals("1", ColumnCaster.asString(ValueFactory.newInteger(1)));
193
+ }
194
+
195
+ @Test
196
+ public void asStringFromFloat()
197
+ {
198
+ assertEquals("1.5", ColumnCaster.asString(ValueFactory.newFloat(1.5)));
199
+ }
200
+
201
+ @Test
202
+ public void asStringFromString()
203
+ {
204
+ assertEquals("1", ColumnCaster.asString(ValueFactory.newString("1")));
205
+ }
206
+
207
+ @Test
208
+ public void asStringFromJson()
209
+ {
210
+ assertEquals("{\"k\":\"v\"}", ColumnCaster.asString(mapValue));
211
+ }
212
+
213
+ @Test
214
+ public void asTimestampFromBoolean()
215
+ {
216
+ try {
217
+ ColumnCaster.asTimestamp(ValueFactory.newBoolean(true), parser);
218
+ fail();
219
+ }
220
+ catch (Throwable t) {
221
+ assertTrue(t instanceof DataException);
222
+ }
223
+ }
224
+
225
+ @Test
226
+ public void asTimestampFromInteger()
227
+ {
228
+ assertEquals(1, ColumnCaster.asTimestamp(ValueFactory.newInteger(1), parser).getEpochSecond());
229
+ }
230
+
231
+ @Test
232
+ public void asTimestampFromFloat()
233
+ {
234
+ Timestamp expected = Timestamp.ofEpochSecond(1463084053, 500000000);
235
+ assertEquals(expected, ColumnCaster.asTimestamp(ValueFactory.newFloat(1463084053.5), parser));
236
+ }
237
+
238
+ @Test
239
+ public void asTimestampFromString()
240
+ {
241
+ Timestamp expected = Timestamp.ofEpochSecond(1463084053, 500000000);
242
+ assertEquals(expected, ColumnCaster.asTimestamp(ValueFactory.newString("2016-05-12 20:14:13.5"), parser));
243
+ }
244
+
245
+ @Test
246
+ public void asTimestampFromJson()
247
+ {
248
+ try {
249
+ ColumnCaster.asTimestamp(mapValue, parser);
250
+ fail();
251
+ }
252
+ catch (Throwable t) {
253
+ assertTrue(t instanceof DataException);
254
+ }
255
+ }
256
+ }
@@ -0,0 +1,432 @@
1
+ package org.embulk.parser.joni_regexp;
2
+
3
+ import com.google.common.collect.ImmutableList;
4
+ import com.google.common.collect.ImmutableMap;
5
+ import com.google.common.collect.Lists;
6
+ import org.embulk.EmbulkTestRuntime;
7
+ import org.embulk.config.ConfigException;
8
+ import org.embulk.config.ConfigSource;
9
+ import org.embulk.config.TaskSource;
10
+ import org.embulk.spi.ColumnConfig;
11
+ import org.embulk.spi.DataException;
12
+ import org.embulk.spi.Exec;
13
+ import org.embulk.spi.FileInput;
14
+ import org.embulk.spi.ParserPlugin;
15
+ import org.embulk.spi.Schema;
16
+ import org.embulk.spi.SchemaConfig;
17
+ import org.embulk.spi.SchemaConfigException;
18
+ import org.embulk.spi.TestPageBuilderReader;
19
+ import org.embulk.spi.time.Timestamp;
20
+ import org.embulk.spi.type.Type;
21
+ import org.embulk.spi.util.InputStreamFileInput;
22
+ import org.embulk.spi.util.Newline;
23
+ import org.embulk.spi.util.Pages;
24
+ import org.joda.time.DateTimeZone;
25
+ import org.junit.Before;
26
+ import org.junit.Rule;
27
+ import org.junit.Test;
28
+ import org.msgpack.value.Value;
29
+
30
+ import java.io.ByteArrayInputStream;
31
+ import java.io.IOException;
32
+ import java.io.InputStream;
33
+ import java.nio.charset.Charset;
34
+ import java.nio.charset.StandardCharsets;
35
+ import java.util.List;
36
+
37
+ import static junit.framework.TestCase.assertEquals;
38
+ import static org.embulk.spi.type.Types.BOOLEAN;
39
+ import static org.embulk.spi.type.Types.DOUBLE;
40
+ import static org.embulk.spi.type.Types.JSON;
41
+ import static org.embulk.spi.type.Types.LONG;
42
+ import static org.embulk.spi.type.Types.STRING;
43
+ import static org.embulk.spi.type.Types.TIMESTAMP;
44
+ import static org.junit.Assert.assertTrue;
45
+ import static org.msgpack.value.ValueFactory.newArray;
46
+ import static org.msgpack.value.ValueFactory.newFloat;
47
+ import static org.msgpack.value.ValueFactory.newInteger;
48
+ import static org.msgpack.value.ValueFactory.newMap;
49
+ import static org.msgpack.value.ValueFactory.newString;
50
+
51
+ public class TestJoniRegexpParserPlugin
52
+ {
53
+ private ConfigSource config;
54
+ private JoniRegexpParserPlugin plugin;
55
+ private TestPageBuilderReader.MockPageOutput output;
56
+
57
+ // TODO
58
+ // private static final String RESOURCE_NAME_PREFIX = "org/embulk/parser/joni/";
59
+
60
+ @Before
61
+ public void createResource()
62
+ {
63
+ config = config().set("type", "joni_regexp");
64
+ plugin = new JoniRegexpParserPlugin();
65
+ recreatePageOutput();
66
+ }
67
+
68
+ @Rule
69
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
70
+
71
+ // TODO use TestingEmbulk
72
+ // @Rule
73
+ // public TestingEmbulk embulk = TestingEmbulk.builder().build();
74
+
75
+ @Test
76
+ public void basicApacheCombinedLogTest()
77
+ throws Exception
78
+ {
79
+ // TODO Use TestingEmbulk
80
+ // ConfigSource config = embulk.loadYamlResource(RESOURCE_NAME_PREFIX+"apache.yml");
81
+
82
+ SchemaConfig schema = schema(
83
+ column("host", STRING), column("user", STRING),
84
+ column("time", TIMESTAMP, config().set("format", "%d/%b/%Y:%H:%M:%S %z")),
85
+ column("method", STRING), column("path", STRING),
86
+ column("code", STRING), column("size", STRING), column("referer", STRING),
87
+ column("agent", STRING));
88
+ ConfigSource config = this.config.deepCopy().set("columns", schema)
89
+ .set("format", "^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \\[(?<time>[^\\]]*)\\] \"(?<method>\\S+)(?: +(?<path>[^ ]*) +\\S*)?\" (?<code>[^ ]*) (?<size>[^ ]*)(?: \"(?<referer>[^\\\"]*)\" \"(?<agent>[^\\\"]*)\")?$");
90
+
91
+ // config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
92
+
93
+ transaction(config, fileInput(
94
+ "224.126.227.109 - - [13/Feb/2017:20:04:52 +0900] \"GET /category/games HTTP/1.1\" 200 85 \"-\" \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11\"",
95
+ "128.27.132.24 - bob [13/Feb/2017:20:04:53 +0900] \"GET /category/health HTTP/1.1\" 200 103 \"/category/electronics?from=20\" \"Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1\""
96
+ ));
97
+
98
+ List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
99
+ assertEquals(2, records.size());
100
+
101
+ Object[] record;
102
+ {
103
+ record = records.get(0);
104
+ assertEquals("224.126.227.109", record[0]);
105
+ assertEquals("-", record[1]);
106
+ assertEquals(Timestamp.ofEpochSecond(1486983892L), record[2]);
107
+ assertEquals("GET", record[3]);
108
+ assertEquals("/category/games", record[4]);
109
+ // assertEquals("HTTP/1.1", record[5]);
110
+ assertEquals("200", record[5]);
111
+ assertEquals("85", record[6]);
112
+ assertEquals("-", record[7]);
113
+ assertEquals("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", record[8]);
114
+ }
115
+ {
116
+ record = records.get(1);
117
+ assertEquals("128.27.132.24", record[0]);
118
+ assertEquals("bob", record[1]);
119
+ assertEquals(Timestamp.ofEpochSecond(1486983893L), record[2]);
120
+ assertEquals("GET", record[3]);
121
+ assertEquals("/category/health", record[4]);
122
+ // assertEquals("HTTP/1.1", record[5]);
123
+ assertEquals("200", record[5]);
124
+ assertEquals("103", record[6]);
125
+ assertEquals("/category/electronics?from=20", record[7]);
126
+ assertEquals("Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1", record[8]);
127
+ }
128
+ }
129
+
130
+ @Test
131
+ public void basicApacheCombinedLogTestEnableStopOnInvalidRecord()
132
+ throws Exception
133
+ {
134
+ // TODO Use TestingEmbulk
135
+ // ConfigSource config = embulk.loadYamlResource(RESOURCE_NAME_PREFIX+"apache.yml");
136
+
137
+ SchemaConfig schema = schema(
138
+ column("host", STRING), column("user", STRING),
139
+ column("time", TIMESTAMP, config().set("format", "%d/%b/%Y:%H:%M:%S %z")),
140
+ column("method", STRING), column("path", STRING),
141
+ column("code", STRING), column("size", STRING), column("referer", STRING),
142
+ column("agent", STRING));
143
+ ConfigSource config = this.config.deepCopy().set("columns", schema)
144
+ .set("format", "^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \\[(?<time>[^\\]]*)\\] \"(?<method>\\S+)(?: +(?<path>[^ ]*) +\\S*)?\" (?<code>[^ ]*) (?<size>[^ ]*)(?: \"(?<referer>[^\\\"]*)\" \"(?<agent>[^\\\"]*)\")?$")
145
+ .set("stop_on_invalid_record", false);
146
+
147
+ // config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
148
+
149
+ transaction(config, fileInput(
150
+ "224.126.227.109 - - [13/Feb/2017:20:04:52 +0900] \"GET /category/games HTTP/1.1\" 200 85 \"-\" \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11\"",
151
+ "invalid_record1",
152
+ "128.27.132.24 - bob [13/Feb/2017:20:04:53 +0900] \"GET /category/health HTTP/1.1\" 200 103 \"/category/electronics?from=20\" \"Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1\"",
153
+ "invalid_record2"
154
+ ));
155
+
156
+ List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
157
+ assertEquals(2, records.size());
158
+
159
+ Object[] record;
160
+ {
161
+ record = records.get(0);
162
+ assertEquals("224.126.227.109", record[0]);
163
+ assertEquals("-", record[1]);
164
+ assertEquals(Timestamp.ofEpochSecond(1486983892L), record[2]);
165
+ assertEquals("GET", record[3]);
166
+ assertEquals("/category/games", record[4]);
167
+ // assertEquals("HTTP/1.1", record[5]);
168
+ assertEquals("200", record[5]);
169
+ assertEquals("85", record[6]);
170
+ assertEquals("-", record[7]);
171
+ assertEquals("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", record[8]);
172
+ }
173
+ {
174
+ record = records.get(1);
175
+ assertEquals("128.27.132.24", record[0]);
176
+ assertEquals("bob", record[1]);
177
+ assertEquals(Timestamp.ofEpochSecond(1486983893L), record[2]);
178
+ assertEquals("GET", record[3]);
179
+ assertEquals("/category/health", record[4]);
180
+ assertEquals("200", record[5]);
181
+ assertEquals("103", record[6]);
182
+ assertEquals("/category/electronics?from=20", record[7]);
183
+ assertEquals("Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1", record[8]);
184
+ }
185
+ }
186
+
187
+ @Test(expected = DataException.class)
188
+ public void checkInvalidRecord()
189
+ throws Exception
190
+ {
191
+ SchemaConfig schema = schema(
192
+ column("host", STRING), column("user", STRING),
193
+ column("time", TIMESTAMP, config().set("format", "%d/%b/%Y:%H:%M:%S %z")),
194
+ column("method", STRING), column("path", STRING),
195
+ column("code", STRING), column("size", STRING), column("referer", STRING),
196
+ column("agent", STRING));
197
+ ConfigSource config = this.config.deepCopy().set("columns", schema)
198
+ .set("format", "^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \\[(?<time>[^\\]]*)\\] \"(?<method>\\S+)(?: +(?<path>[^ ]*) +\\S*)?\" (?<code>[^ ]*) (?<size>[^ ]*)(?: \"(?<referer>[^\\\"]*)\" \"(?<agent>[^\\\"]*)\")?$")
199
+ .set("stop_on_invalid_record", true);
200
+
201
+ transaction(config, fileInput(
202
+ "224.126.227.109 - - [13/Feb/2017:20:04:52 +0900] \"GET /category/games HTTP/1.1\" 200 85 \"-\" \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11\"",
203
+ "invalid_record1",
204
+ "128.27.132.24 - bob [13/Feb/2017:20:04:53 +0900] \"GET /category/health HTTP/1.1\" 200 103 \"/category/electronics?from=20\" \"Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1\"",
205
+ "invalid_record2"
206
+ ));
207
+ }
208
+
209
+ @Test
210
+ public void checkLookahead()
211
+ throws Exception
212
+ {
213
+
214
+ SchemaConfig schema = schema(
215
+ column("string1", STRING), column("string2", STRING));
216
+
217
+ ConfigSource config = this.config.deepCopy().set("columns", schema)
218
+ .set("format", "^\"(?<string1>.*?)\"(?<!\\.) \"(?<string2>.*?)\"");
219
+
220
+ transaction(config, fileInput(
221
+ "\"This is a \\\"test\\\".\" \"This is a \\\"test\\\".\"]}"));
222
+
223
+ List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
224
+ assertEquals(1, records.size());
225
+
226
+ Object[] record;
227
+ {
228
+ record = records.get(0);
229
+ assertEquals("This is a \\\"test\\\".", record[0]);
230
+ assertEquals("This is a \\", record[1]);
231
+ }
232
+ }
233
+
234
+ @Test(expected = DataException.class)
235
+ public void checkInvalidFormat()
236
+ throws Exception
237
+ {
238
+
239
+ SchemaConfig schema = schema(
240
+ column("date", TIMESTAMP, config().set("format", "%H:%M:%S")));
241
+
242
+ ConfigSource config = this.config.deepCopy().set("columns", schema)
243
+ .set("format", "(?<date>\\S+)");
244
+
245
+ transaction(config, fileInput(
246
+ "2017-02-19 This is a test."));
247
+ }
248
+
249
+ @Test
250
+ public void checkAllColumnTypes()
251
+ throws Exception
252
+ {
253
+
254
+ SchemaConfig schema = schema(
255
+ column("bool", BOOLEAN), column("string", STRING),
256
+ column("time", TIMESTAMP, config().set("format", "%Y-%m-%d %H:%M:%S")),
257
+ column("long", LONG), column("double", DOUBLE),
258
+ column("json", JSON));
259
+
260
+ ConfigSource config = this.config.deepCopy().set("columns", schema)
261
+ .set("format", "^(?<bool>[^\t]*)\t(?<string>[^\t]*)\t(?<time>[^\t]*)\t*(?<long>[^\t]*)\t(?<double>[^\t]*)\t(?<json>[^\t]*)$");
262
+
263
+ transaction(config, fileInput(
264
+ "true\tマイケル・ジャクソン\t2009-6-25 00:00:00\t456789\t123.456\t{\"name\":\"Michael Jackson\",\"birth\":\"1958-8-29\",\"age\":50,\"Bad World Tour\":4.4,\"album\":[\"Got To Be There\",\"Ben\",\"Music & Me\"]}"));
265
+
266
+ Value json = newMap(newString("name"), newString("Michael Jackson"),
267
+ newString("birth"), newString("1958-8-29"),
268
+ newString("age"), newInteger(50),
269
+ newString("Bad World Tour"), newFloat(4.4),
270
+ newString("album"), newArray(newString("Got To Be There"), newString("Ben"), newString("Music & Me")));
271
+ List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
272
+ assertEquals(1, records.size());
273
+
274
+ Object[] record;
275
+ {
276
+ record = records.get(0);
277
+ assertEquals(true, record[0]);
278
+ assertEquals("マイケル・ジャクソン", record[1]);
279
+ assertEquals(Timestamp.ofEpochSecond(1245888000L), record[2]);
280
+ assertEquals(456789L, record[3]);
281
+ assertEquals(123.456, record[4]);
282
+ assertEquals(json, record[5]);
283
+ }
284
+ }
285
+
286
+ @Test
287
+ public void checkDefaultValues()
288
+ {
289
+ ConfigSource config = Exec.newConfigSource()
290
+ .set("columns", ImmutableList.of(
291
+ ImmutableMap.of(
292
+ "name", "name",
293
+ "type", "string")))
294
+ .set("format", "(?<name>a*)");
295
+
296
+ JoniRegexpParserPlugin.PluginTask task = config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
297
+
298
+ assertEquals(Charset.forName("utf-8"), task.getCharset());
299
+ assertEquals(Newline.CRLF, task.getNewline());
300
+ assertEquals(DateTimeZone.UTC, task.getDefaultTimeZone());
301
+ assertEquals("%Y-%m-%d %H:%M:%S.%N %z", task.getDefaultTimestampFormat());
302
+ assertEquals(false, task.getStopOnInvalidRecord());
303
+ //assertEquals( true, task.getDefaultTypecast());
304
+
305
+ }
306
+
307
+ @Test(expected = ConfigException.class)
308
+ public void checkColumnsRequired()
309
+ {
310
+ ConfigSource config = Exec.newConfigSource()
311
+ .set("format", "(?<name>a*)");
312
+
313
+ config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
314
+ }
315
+
316
+ @Test(expected = ConfigException.class)
317
+ public void checkFormatRequired()
318
+ {
319
+ ConfigSource config = Exec.newConfigSource()
320
+ .set("columns", ImmutableList.of(
321
+ ImmutableMap.of(
322
+ "name", "name",
323
+ "type", "string")));
324
+
325
+ config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
326
+ }
327
+
328
+ @Test // (expected = SchemaConfigException.class)
329
+ public void checkNamedCaptureColumnNotFound()
330
+ {
331
+ ConfigSource config2 = Exec.newConfigSource()
332
+ .set("columns", ImmutableList.of(
333
+ ImmutableMap.of(
334
+ "name", "hoge",
335
+ "type", "string")))
336
+ .set("format", "(?<no_capture_name>a*)");
337
+
338
+ config2.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
339
+ try {
340
+ transaction(config2, fileInput(""));
341
+ }
342
+ catch (Throwable t) {
343
+ assertTrue(t instanceof SchemaConfigException);
344
+ }
345
+ }
346
+
347
+ @Test(expected = org.embulk.config.ConfigException.class)
348
+ public void checkNoNamedCapturingGroupRegex()
349
+ throws Exception
350
+ {
351
+ ConfigSource config2 = Exec.newConfigSource()
352
+ .set("columns", ImmutableList.of(
353
+ ImmutableMap.of(
354
+ "name", "hoge",
355
+ "type", "string")))
356
+ .set("format", "no named capturing group regex");
357
+
358
+ config2.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
359
+ transaction(config2, fileInput(""));
360
+ }
361
+
362
+ @Test(expected = org.joni.exception.SyntaxException.class)
363
+ public void checkInvalidRegexSyntax()
364
+ throws Exception
365
+ {
366
+ ConfigSource config2 = Exec.newConfigSource()
367
+ .set("columns", ImmutableList.of(
368
+ ImmutableMap.of(
369
+ "name", "hoge",
370
+ "type", "string")))
371
+ .set("format", "(?<invalid_regex");
372
+
373
+ config2.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
374
+ transaction(config2, fileInput(""));
375
+ }
376
+
377
+ private void transaction(ConfigSource config, final FileInput input)
378
+ {
379
+ plugin.transaction(config, new ParserPlugin.Control()
380
+ {
381
+ @Override
382
+ public void run(TaskSource taskSource, Schema schema)
383
+ {
384
+ plugin.run(taskSource, schema, input, output);
385
+ }
386
+ });
387
+ }
388
+
389
+ private void recreatePageOutput()
390
+ {
391
+ output = new TestPageBuilderReader.MockPageOutput();
392
+ }
393
+
394
+ private FileInput fileInput(String... lines)
395
+ throws Exception
396
+ {
397
+ StringBuilder sb = new StringBuilder();
398
+ for (String line : lines) {
399
+ sb.append(line).append("\n");
400
+ }
401
+
402
+ ByteArrayInputStream in = new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8));
403
+ return new InputStreamFileInput(runtime.getBufferAllocator(), provider(in));
404
+ }
405
+
406
+ private InputStreamFileInput.IteratorProvider provider(InputStream... inputStreams)
407
+ throws IOException
408
+ {
409
+ return new InputStreamFileInput.IteratorProvider(
410
+ ImmutableList.copyOf(inputStreams));
411
+ }
412
+
413
+ private ConfigSource config()
414
+ {
415
+ return runtime.getExec().newConfigSource();
416
+ }
417
+
418
+ private SchemaConfig schema(ColumnConfig... columns)
419
+ {
420
+ return new SchemaConfig(Lists.newArrayList(columns));
421
+ }
422
+
423
+ private ColumnConfig column(String name, Type type)
424
+ {
425
+ return column(name, type, config());
426
+ }
427
+
428
+ private ColumnConfig column(String name, Type type, ConfigSource option)
429
+ {
430
+ return new ColumnConfig(name, type, option);
431
+ }
432
+ }