embulk-parser-joni_regexp 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.travis.yml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +107 -0
- data/build.gradle +101 -0
- data/config/checkstyle/checkstyle.xml +128 -0
- data/config/checkstyle/default.xml +108 -0
- data/example/apache.txt +5000 -0
- data/example/apache.yml +18 -0
- data/example/conf.yml +12 -0
- data/example/conf2.yml +12 -0
- data/example/conf3.yml +12 -0
- data/example/seed.yml +8 -0
- data/example/test.txt +8 -0
- data/example/test2.txt +3 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +6 -0
- data/gradlew +169 -0
- data/gradlew.bat +84 -0
- data/lib/embulk/guess/joni_regexp.rb +28 -0
- data/lib/embulk/parser/joni_regexp.rb +3 -0
- data/src/main/java/org/embulk/parser/joni_regexp/ColumnCaster.java +97 -0
- data/src/main/java/org/embulk/parser/joni_regexp/ColumnVisitorImpl.java +167 -0
- data/src/main/java/org/embulk/parser/joni_regexp/JoniRegexpParserPlugin.java +169 -0
- data/src/main/java/org/embulk/parser/joni_regexp/JsonRecordValidateException.java +22 -0
- data/src/main/java/org/embulk/parser/joni_regexp/cast/BooleanCast.java +39 -0
- data/src/main/java/org/embulk/parser/joni_regexp/cast/DoubleCast.java +41 -0
- data/src/main/java/org/embulk/parser/joni_regexp/cast/JsonCast.java +40 -0
- data/src/main/java/org/embulk/parser/joni_regexp/cast/LongCast.java +47 -0
- data/src/main/java/org/embulk/parser/joni_regexp/cast/StringCast.java +82 -0
- data/src/test/java/org/embulk/parser/joni_regexp/TestColumnCaster.java +256 -0
- data/src/test/java/org/embulk/parser/joni_regexp/TestJoniRegexpParserPlugin.java +432 -0
- data/src/test/java/org/embulk/parser/joni_regexp/cast/TestBooleanCast.java +56 -0
- data/src/test/java/org/embulk/parser/joni_regexp/cast/TestDoubleCast.java +49 -0
- data/src/test/java/org/embulk/parser/joni_regexp/cast/TestJsonCast.java +79 -0
- data/src/test/java/org/embulk/parser/joni_regexp/cast/TestLongCast.java +41 -0
- data/src/test/java/org/embulk/parser/joni_regexp/cast/TestStringCast.java +103 -0
- metadata +112 -0
@@ -0,0 +1,47 @@
|
|
1
|
+
package org.embulk.parser.joni_regexp.cast;
|
2
|
+
|
3
|
+
import org.embulk.spi.DataException;
|
4
|
+
import org.embulk.spi.time.Timestamp;
|
5
|
+
|
6
|
+
public class LongCast
|
7
|
+
{
|
8
|
+
private LongCast() {}
|
9
|
+
|
10
|
+
private static String buildErrorMessage(String as, long value)
|
11
|
+
{
|
12
|
+
return String.format("cannot cast long to %s: \"%s\"", as, value);
|
13
|
+
}
|
14
|
+
|
15
|
+
public static boolean asBoolean(long value) throws DataException
|
16
|
+
{
|
17
|
+
if (value == 1) {
|
18
|
+
return true;
|
19
|
+
}
|
20
|
+
else if (value == 0) {
|
21
|
+
return false;
|
22
|
+
}
|
23
|
+
else {
|
24
|
+
throw new DataException(buildErrorMessage("boolean", value));
|
25
|
+
}
|
26
|
+
}
|
27
|
+
|
28
|
+
public static long asLong(long value) throws DataException
|
29
|
+
{
|
30
|
+
return value;
|
31
|
+
}
|
32
|
+
|
33
|
+
public static double asDouble(long value) throws DataException
|
34
|
+
{
|
35
|
+
return (double) value;
|
36
|
+
}
|
37
|
+
|
38
|
+
public static String asString(long value) throws DataException
|
39
|
+
{
|
40
|
+
return String.valueOf(value);
|
41
|
+
}
|
42
|
+
|
43
|
+
public static Timestamp asTimestamp(long value) throws DataException
|
44
|
+
{
|
45
|
+
return Timestamp.ofEpochSecond(value);
|
46
|
+
}
|
47
|
+
}
|
@@ -0,0 +1,82 @@
|
|
1
|
+
package org.embulk.parser.joni_regexp.cast;
|
2
|
+
|
3
|
+
import com.google.common.collect.ImmutableSet;
|
4
|
+
import org.embulk.spi.DataException;
|
5
|
+
import org.embulk.spi.time.Timestamp;
|
6
|
+
import org.embulk.spi.time.TimestampParseException;
|
7
|
+
import org.embulk.spi.time.TimestampParser;
|
8
|
+
|
9
|
+
public class StringCast
|
10
|
+
{
|
11
|
+
// copy from csv plugin
|
12
|
+
public static final ImmutableSet<String> TRUE_STRINGS =
|
13
|
+
ImmutableSet.of(
|
14
|
+
"true", "True", "TRUE",
|
15
|
+
"yes", "Yes", "YES",
|
16
|
+
"t", "T", "y", "Y",
|
17
|
+
"on", "On", "ON",
|
18
|
+
"1");
|
19
|
+
|
20
|
+
public static final ImmutableSet<String> FALSE_STRINGS =
|
21
|
+
ImmutableSet.of(
|
22
|
+
"false", "False", "FALSE",
|
23
|
+
"no", "No", "NO",
|
24
|
+
"f", "F", "n", "N",
|
25
|
+
"off", "Off", "OFF",
|
26
|
+
"0");
|
27
|
+
|
28
|
+
private StringCast() {}
|
29
|
+
|
30
|
+
private static String buildErrorMessage(String as, String value)
|
31
|
+
{
|
32
|
+
return String.format("cannot cast String to %s: \"%s\"", as, value);
|
33
|
+
}
|
34
|
+
|
35
|
+
public static boolean asBoolean(String value) throws DataException
|
36
|
+
{
|
37
|
+
if (TRUE_STRINGS.contains(value)) {
|
38
|
+
return true;
|
39
|
+
}
|
40
|
+
else if (FALSE_STRINGS.contains(value)) {
|
41
|
+
return false;
|
42
|
+
}
|
43
|
+
else {
|
44
|
+
throw new DataException(buildErrorMessage("boolean", value));
|
45
|
+
}
|
46
|
+
}
|
47
|
+
|
48
|
+
public static long asLong(String value) throws DataException
|
49
|
+
{
|
50
|
+
try {
|
51
|
+
return Long.parseLong(value);
|
52
|
+
}
|
53
|
+
catch (NumberFormatException ex) {
|
54
|
+
throw new DataException(buildErrorMessage("long", value), ex);
|
55
|
+
}
|
56
|
+
}
|
57
|
+
|
58
|
+
public static double asDouble(String value) throws DataException
|
59
|
+
{
|
60
|
+
try {
|
61
|
+
return Double.parseDouble(value);
|
62
|
+
}
|
63
|
+
catch (NumberFormatException ex) {
|
64
|
+
throw new DataException(buildErrorMessage("double", value), ex);
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
public static String asString(String value) throws DataException
|
69
|
+
{
|
70
|
+
return value;
|
71
|
+
}
|
72
|
+
|
73
|
+
public static Timestamp asTimestamp(String value, TimestampParser parser) throws DataException
|
74
|
+
{
|
75
|
+
try {
|
76
|
+
return parser.parse(value);
|
77
|
+
}
|
78
|
+
catch (TimestampParseException ex) {
|
79
|
+
throw new DataException(buildErrorMessage("timestamp", value), ex);
|
80
|
+
}
|
81
|
+
}
|
82
|
+
}
|
@@ -0,0 +1,256 @@
|
|
1
|
+
package org.embulk.parser.joni_regexp;
|
2
|
+
|
3
|
+
import org.embulk.EmbulkTestRuntime;
|
4
|
+
import org.embulk.spi.DataException;
|
5
|
+
import org.embulk.spi.time.Timestamp;
|
6
|
+
import org.embulk.spi.time.TimestampParser;
|
7
|
+
import org.joda.time.DateTimeZone;
|
8
|
+
import org.jruby.embed.ScriptingContainer;
|
9
|
+
import org.junit.Before;
|
10
|
+
import org.junit.Rule;
|
11
|
+
import org.junit.Test;
|
12
|
+
import org.msgpack.value.MapValue;
|
13
|
+
import org.msgpack.value.Value;
|
14
|
+
import org.msgpack.value.ValueFactory;
|
15
|
+
|
16
|
+
import static org.junit.Assert.assertEquals;
|
17
|
+
import static org.junit.Assert.assertTrue;
|
18
|
+
import static org.junit.Assert.fail;
|
19
|
+
|
20
|
+
public class TestColumnCaster
|
21
|
+
{
|
22
|
+
@Rule
|
23
|
+
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
24
|
+
public MapValue mapValue;
|
25
|
+
public DataException thrown;
|
26
|
+
public ScriptingContainer jruby;
|
27
|
+
public TimestampParser parser;
|
28
|
+
|
29
|
+
@Before
|
30
|
+
public void createResource()
|
31
|
+
{
|
32
|
+
jruby = new ScriptingContainer();
|
33
|
+
thrown = new DataException("any");
|
34
|
+
Value[] kvs = new Value[2];
|
35
|
+
kvs[0] = ValueFactory.newString("k");
|
36
|
+
kvs[1] = ValueFactory.newString("v");
|
37
|
+
mapValue = ValueFactory.newMap(kvs);
|
38
|
+
parser = new TimestampParser(jruby, "%Y-%m-%d %H:%M:%S.%N", DateTimeZone.UTC);
|
39
|
+
}
|
40
|
+
|
41
|
+
@Test
|
42
|
+
public void asBooleanFromBoolean()
|
43
|
+
{
|
44
|
+
assertEquals(true, ColumnCaster.asBoolean(ValueFactory.newBoolean(true)));
|
45
|
+
}
|
46
|
+
|
47
|
+
@Test
|
48
|
+
public void asBooleanFromInteger()
|
49
|
+
{
|
50
|
+
assertEquals(true, ColumnCaster.asBoolean(ValueFactory.newInteger(1)));
|
51
|
+
try {
|
52
|
+
ColumnCaster.asBoolean(ValueFactory.newInteger(2));
|
53
|
+
fail();
|
54
|
+
}
|
55
|
+
catch (Throwable t) {
|
56
|
+
assertTrue(t instanceof DataException);
|
57
|
+
}
|
58
|
+
}
|
59
|
+
|
60
|
+
@Test
|
61
|
+
public void asBooleanFromFloat()
|
62
|
+
{
|
63
|
+
try {
|
64
|
+
ColumnCaster.asBoolean(ValueFactory.newFloat(1.1));
|
65
|
+
fail();
|
66
|
+
}
|
67
|
+
catch (Throwable t) {
|
68
|
+
assertTrue(t instanceof DataException);
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
@Test
|
73
|
+
public void asBooleanFromString()
|
74
|
+
{
|
75
|
+
assertEquals(true, ColumnCaster.asBoolean(ValueFactory.newString("true")));
|
76
|
+
try {
|
77
|
+
ColumnCaster.asBoolean(ValueFactory.newString("foo"));
|
78
|
+
fail();
|
79
|
+
}
|
80
|
+
catch (Throwable t) {
|
81
|
+
assertTrue(t instanceof DataException);
|
82
|
+
}
|
83
|
+
}
|
84
|
+
|
85
|
+
@Test
|
86
|
+
public void asBooleanFromJson()
|
87
|
+
{
|
88
|
+
try {
|
89
|
+
ColumnCaster.asBoolean(mapValue);
|
90
|
+
fail();
|
91
|
+
}
|
92
|
+
catch (Throwable t) {
|
93
|
+
assertTrue(t instanceof DataException);
|
94
|
+
}
|
95
|
+
}
|
96
|
+
|
97
|
+
@Test
|
98
|
+
public void asLongFromBoolean()
|
99
|
+
{
|
100
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newBoolean(true)));
|
101
|
+
}
|
102
|
+
|
103
|
+
@Test
|
104
|
+
public void asLongFromInteger()
|
105
|
+
{
|
106
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newInteger(1)));
|
107
|
+
}
|
108
|
+
|
109
|
+
@Test
|
110
|
+
public void asLongFromFloat()
|
111
|
+
{
|
112
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newFloat(1.5)));
|
113
|
+
}
|
114
|
+
|
115
|
+
@Test
|
116
|
+
public void asLongFromString()
|
117
|
+
{
|
118
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newString("1")));
|
119
|
+
try {
|
120
|
+
ColumnCaster.asLong(ValueFactory.newString("foo"));
|
121
|
+
fail();
|
122
|
+
}
|
123
|
+
catch (Throwable t) {
|
124
|
+
assertTrue(t instanceof DataException);
|
125
|
+
}
|
126
|
+
}
|
127
|
+
|
128
|
+
@Test
|
129
|
+
public void asLongFromJson()
|
130
|
+
{
|
131
|
+
try {
|
132
|
+
ColumnCaster.asLong(mapValue);
|
133
|
+
fail();
|
134
|
+
}
|
135
|
+
catch (Throwable t) {
|
136
|
+
assertTrue(t instanceof DataException);
|
137
|
+
}
|
138
|
+
}
|
139
|
+
|
140
|
+
@Test
|
141
|
+
public void asDoubleFromBoolean()
|
142
|
+
{
|
143
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newBoolean(true)));
|
144
|
+
}
|
145
|
+
|
146
|
+
@Test
|
147
|
+
public void asDoubleFromInteger()
|
148
|
+
{
|
149
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newInteger(1)));
|
150
|
+
}
|
151
|
+
|
152
|
+
@Test
|
153
|
+
public void asDoubleFromFloat()
|
154
|
+
{
|
155
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newFloat(1.5)));
|
156
|
+
}
|
157
|
+
|
158
|
+
@Test
|
159
|
+
public void asDoubleFromString()
|
160
|
+
{
|
161
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newString("1")));
|
162
|
+
try {
|
163
|
+
ColumnCaster.asLong(ValueFactory.newString("foo"));
|
164
|
+
fail();
|
165
|
+
}
|
166
|
+
catch (Throwable t) {
|
167
|
+
assertTrue(t instanceof DataException);
|
168
|
+
}
|
169
|
+
}
|
170
|
+
|
171
|
+
@Test
|
172
|
+
public void asDoubleFromJson()
|
173
|
+
{
|
174
|
+
try {
|
175
|
+
ColumnCaster.asLong(mapValue);
|
176
|
+
fail();
|
177
|
+
}
|
178
|
+
catch (Throwable t) {
|
179
|
+
assertTrue(t instanceof DataException);
|
180
|
+
}
|
181
|
+
}
|
182
|
+
|
183
|
+
@Test
|
184
|
+
public void asStringFromBoolean()
|
185
|
+
{
|
186
|
+
assertEquals("true", ColumnCaster.asString(ValueFactory.newBoolean(true)));
|
187
|
+
}
|
188
|
+
|
189
|
+
@Test
|
190
|
+
public void asStringFromInteger()
|
191
|
+
{
|
192
|
+
assertEquals("1", ColumnCaster.asString(ValueFactory.newInteger(1)));
|
193
|
+
}
|
194
|
+
|
195
|
+
@Test
|
196
|
+
public void asStringFromFloat()
|
197
|
+
{
|
198
|
+
assertEquals("1.5", ColumnCaster.asString(ValueFactory.newFloat(1.5)));
|
199
|
+
}
|
200
|
+
|
201
|
+
@Test
|
202
|
+
public void asStringFromString()
|
203
|
+
{
|
204
|
+
assertEquals("1", ColumnCaster.asString(ValueFactory.newString("1")));
|
205
|
+
}
|
206
|
+
|
207
|
+
@Test
|
208
|
+
public void asStringFromJson()
|
209
|
+
{
|
210
|
+
assertEquals("{\"k\":\"v\"}", ColumnCaster.asString(mapValue));
|
211
|
+
}
|
212
|
+
|
213
|
+
@Test
|
214
|
+
public void asTimestampFromBoolean()
|
215
|
+
{
|
216
|
+
try {
|
217
|
+
ColumnCaster.asTimestamp(ValueFactory.newBoolean(true), parser);
|
218
|
+
fail();
|
219
|
+
}
|
220
|
+
catch (Throwable t) {
|
221
|
+
assertTrue(t instanceof DataException);
|
222
|
+
}
|
223
|
+
}
|
224
|
+
|
225
|
+
@Test
|
226
|
+
public void asTimestampFromInteger()
|
227
|
+
{
|
228
|
+
assertEquals(1, ColumnCaster.asTimestamp(ValueFactory.newInteger(1), parser).getEpochSecond());
|
229
|
+
}
|
230
|
+
|
231
|
+
@Test
|
232
|
+
public void asTimestampFromFloat()
|
233
|
+
{
|
234
|
+
Timestamp expected = Timestamp.ofEpochSecond(1463084053, 500000000);
|
235
|
+
assertEquals(expected, ColumnCaster.asTimestamp(ValueFactory.newFloat(1463084053.5), parser));
|
236
|
+
}
|
237
|
+
|
238
|
+
@Test
|
239
|
+
public void asTimestampFromString()
|
240
|
+
{
|
241
|
+
Timestamp expected = Timestamp.ofEpochSecond(1463084053, 500000000);
|
242
|
+
assertEquals(expected, ColumnCaster.asTimestamp(ValueFactory.newString("2016-05-12 20:14:13.5"), parser));
|
243
|
+
}
|
244
|
+
|
245
|
+
@Test
|
246
|
+
public void asTimestampFromJson()
|
247
|
+
{
|
248
|
+
try {
|
249
|
+
ColumnCaster.asTimestamp(mapValue, parser);
|
250
|
+
fail();
|
251
|
+
}
|
252
|
+
catch (Throwable t) {
|
253
|
+
assertTrue(t instanceof DataException);
|
254
|
+
}
|
255
|
+
}
|
256
|
+
}
|
@@ -0,0 +1,432 @@
|
|
1
|
+
package org.embulk.parser.joni_regexp;
|
2
|
+
|
3
|
+
import com.google.common.collect.ImmutableList;
|
4
|
+
import com.google.common.collect.ImmutableMap;
|
5
|
+
import com.google.common.collect.Lists;
|
6
|
+
import org.embulk.EmbulkTestRuntime;
|
7
|
+
import org.embulk.config.ConfigException;
|
8
|
+
import org.embulk.config.ConfigSource;
|
9
|
+
import org.embulk.config.TaskSource;
|
10
|
+
import org.embulk.spi.ColumnConfig;
|
11
|
+
import org.embulk.spi.DataException;
|
12
|
+
import org.embulk.spi.Exec;
|
13
|
+
import org.embulk.spi.FileInput;
|
14
|
+
import org.embulk.spi.ParserPlugin;
|
15
|
+
import org.embulk.spi.Schema;
|
16
|
+
import org.embulk.spi.SchemaConfig;
|
17
|
+
import org.embulk.spi.SchemaConfigException;
|
18
|
+
import org.embulk.spi.TestPageBuilderReader;
|
19
|
+
import org.embulk.spi.time.Timestamp;
|
20
|
+
import org.embulk.spi.type.Type;
|
21
|
+
import org.embulk.spi.util.InputStreamFileInput;
|
22
|
+
import org.embulk.spi.util.Newline;
|
23
|
+
import org.embulk.spi.util.Pages;
|
24
|
+
import org.joda.time.DateTimeZone;
|
25
|
+
import org.junit.Before;
|
26
|
+
import org.junit.Rule;
|
27
|
+
import org.junit.Test;
|
28
|
+
import org.msgpack.value.Value;
|
29
|
+
|
30
|
+
import java.io.ByteArrayInputStream;
|
31
|
+
import java.io.IOException;
|
32
|
+
import java.io.InputStream;
|
33
|
+
import java.nio.charset.Charset;
|
34
|
+
import java.nio.charset.StandardCharsets;
|
35
|
+
import java.util.List;
|
36
|
+
|
37
|
+
import static junit.framework.TestCase.assertEquals;
|
38
|
+
import static org.embulk.spi.type.Types.BOOLEAN;
|
39
|
+
import static org.embulk.spi.type.Types.DOUBLE;
|
40
|
+
import static org.embulk.spi.type.Types.JSON;
|
41
|
+
import static org.embulk.spi.type.Types.LONG;
|
42
|
+
import static org.embulk.spi.type.Types.STRING;
|
43
|
+
import static org.embulk.spi.type.Types.TIMESTAMP;
|
44
|
+
import static org.junit.Assert.assertTrue;
|
45
|
+
import static org.msgpack.value.ValueFactory.newArray;
|
46
|
+
import static org.msgpack.value.ValueFactory.newFloat;
|
47
|
+
import static org.msgpack.value.ValueFactory.newInteger;
|
48
|
+
import static org.msgpack.value.ValueFactory.newMap;
|
49
|
+
import static org.msgpack.value.ValueFactory.newString;
|
50
|
+
|
51
|
+
public class TestJoniRegexpParserPlugin
|
52
|
+
{
|
53
|
+
private ConfigSource config;
|
54
|
+
private JoniRegexpParserPlugin plugin;
|
55
|
+
private TestPageBuilderReader.MockPageOutput output;
|
56
|
+
|
57
|
+
// TODO
|
58
|
+
// private static final String RESOURCE_NAME_PREFIX = "org/embulk/parser/joni/";
|
59
|
+
|
60
|
+
@Before
|
61
|
+
public void createResource()
|
62
|
+
{
|
63
|
+
config = config().set("type", "joni_regexp");
|
64
|
+
plugin = new JoniRegexpParserPlugin();
|
65
|
+
recreatePageOutput();
|
66
|
+
}
|
67
|
+
|
68
|
+
@Rule
|
69
|
+
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
70
|
+
|
71
|
+
// TODO use TestingEmbulk
|
72
|
+
// @Rule
|
73
|
+
// public TestingEmbulk embulk = TestingEmbulk.builder().build();
|
74
|
+
|
75
|
+
@Test
|
76
|
+
public void basicApacheCombinedLogTest()
|
77
|
+
throws Exception
|
78
|
+
{
|
79
|
+
// TODO Use TestingEmbulk
|
80
|
+
// ConfigSource config = embulk.loadYamlResource(RESOURCE_NAME_PREFIX+"apache.yml");
|
81
|
+
|
82
|
+
SchemaConfig schema = schema(
|
83
|
+
column("host", STRING), column("user", STRING),
|
84
|
+
column("time", TIMESTAMP, config().set("format", "%d/%b/%Y:%H:%M:%S %z")),
|
85
|
+
column("method", STRING), column("path", STRING),
|
86
|
+
column("code", STRING), column("size", STRING), column("referer", STRING),
|
87
|
+
column("agent", STRING));
|
88
|
+
ConfigSource config = this.config.deepCopy().set("columns", schema)
|
89
|
+
.set("format", "^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \\[(?<time>[^\\]]*)\\] \"(?<method>\\S+)(?: +(?<path>[^ ]*) +\\S*)?\" (?<code>[^ ]*) (?<size>[^ ]*)(?: \"(?<referer>[^\\\"]*)\" \"(?<agent>[^\\\"]*)\")?$");
|
90
|
+
|
91
|
+
// config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
92
|
+
|
93
|
+
transaction(config, fileInput(
|
94
|
+
"224.126.227.109 - - [13/Feb/2017:20:04:52 +0900] \"GET /category/games HTTP/1.1\" 200 85 \"-\" \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11\"",
|
95
|
+
"128.27.132.24 - bob [13/Feb/2017:20:04:53 +0900] \"GET /category/health HTTP/1.1\" 200 103 \"/category/electronics?from=20\" \"Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1\""
|
96
|
+
));
|
97
|
+
|
98
|
+
List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
|
99
|
+
assertEquals(2, records.size());
|
100
|
+
|
101
|
+
Object[] record;
|
102
|
+
{
|
103
|
+
record = records.get(0);
|
104
|
+
assertEquals("224.126.227.109", record[0]);
|
105
|
+
assertEquals("-", record[1]);
|
106
|
+
assertEquals(Timestamp.ofEpochSecond(1486983892L), record[2]);
|
107
|
+
assertEquals("GET", record[3]);
|
108
|
+
assertEquals("/category/games", record[4]);
|
109
|
+
// assertEquals("HTTP/1.1", record[5]);
|
110
|
+
assertEquals("200", record[5]);
|
111
|
+
assertEquals("85", record[6]);
|
112
|
+
assertEquals("-", record[7]);
|
113
|
+
assertEquals("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", record[8]);
|
114
|
+
}
|
115
|
+
{
|
116
|
+
record = records.get(1);
|
117
|
+
assertEquals("128.27.132.24", record[0]);
|
118
|
+
assertEquals("bob", record[1]);
|
119
|
+
assertEquals(Timestamp.ofEpochSecond(1486983893L), record[2]);
|
120
|
+
assertEquals("GET", record[3]);
|
121
|
+
assertEquals("/category/health", record[4]);
|
122
|
+
// assertEquals("HTTP/1.1", record[5]);
|
123
|
+
assertEquals("200", record[5]);
|
124
|
+
assertEquals("103", record[6]);
|
125
|
+
assertEquals("/category/electronics?from=20", record[7]);
|
126
|
+
assertEquals("Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1", record[8]);
|
127
|
+
}
|
128
|
+
}
|
129
|
+
|
130
|
+
@Test
|
131
|
+
public void basicApacheCombinedLogTestEnableStopOnInvalidRecord()
|
132
|
+
throws Exception
|
133
|
+
{
|
134
|
+
// TODO Use TestingEmbulk
|
135
|
+
// ConfigSource config = embulk.loadYamlResource(RESOURCE_NAME_PREFIX+"apache.yml");
|
136
|
+
|
137
|
+
SchemaConfig schema = schema(
|
138
|
+
column("host", STRING), column("user", STRING),
|
139
|
+
column("time", TIMESTAMP, config().set("format", "%d/%b/%Y:%H:%M:%S %z")),
|
140
|
+
column("method", STRING), column("path", STRING),
|
141
|
+
column("code", STRING), column("size", STRING), column("referer", STRING),
|
142
|
+
column("agent", STRING));
|
143
|
+
ConfigSource config = this.config.deepCopy().set("columns", schema)
|
144
|
+
.set("format", "^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \\[(?<time>[^\\]]*)\\] \"(?<method>\\S+)(?: +(?<path>[^ ]*) +\\S*)?\" (?<code>[^ ]*) (?<size>[^ ]*)(?: \"(?<referer>[^\\\"]*)\" \"(?<agent>[^\\\"]*)\")?$")
|
145
|
+
.set("stop_on_invalid_record", false);
|
146
|
+
|
147
|
+
// config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
148
|
+
|
149
|
+
transaction(config, fileInput(
|
150
|
+
"224.126.227.109 - - [13/Feb/2017:20:04:52 +0900] \"GET /category/games HTTP/1.1\" 200 85 \"-\" \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11\"",
|
151
|
+
"invalid_record1",
|
152
|
+
"128.27.132.24 - bob [13/Feb/2017:20:04:53 +0900] \"GET /category/health HTTP/1.1\" 200 103 \"/category/electronics?from=20\" \"Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1\"",
|
153
|
+
"invalid_record2"
|
154
|
+
));
|
155
|
+
|
156
|
+
List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
|
157
|
+
assertEquals(2, records.size());
|
158
|
+
|
159
|
+
Object[] record;
|
160
|
+
{
|
161
|
+
record = records.get(0);
|
162
|
+
assertEquals("224.126.227.109", record[0]);
|
163
|
+
assertEquals("-", record[1]);
|
164
|
+
assertEquals(Timestamp.ofEpochSecond(1486983892L), record[2]);
|
165
|
+
assertEquals("GET", record[3]);
|
166
|
+
assertEquals("/category/games", record[4]);
|
167
|
+
// assertEquals("HTTP/1.1", record[5]);
|
168
|
+
assertEquals("200", record[5]);
|
169
|
+
assertEquals("85", record[6]);
|
170
|
+
assertEquals("-", record[7]);
|
171
|
+
assertEquals("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", record[8]);
|
172
|
+
}
|
173
|
+
{
|
174
|
+
record = records.get(1);
|
175
|
+
assertEquals("128.27.132.24", record[0]);
|
176
|
+
assertEquals("bob", record[1]);
|
177
|
+
assertEquals(Timestamp.ofEpochSecond(1486983893L), record[2]);
|
178
|
+
assertEquals("GET", record[3]);
|
179
|
+
assertEquals("/category/health", record[4]);
|
180
|
+
assertEquals("200", record[5]);
|
181
|
+
assertEquals("103", record[6]);
|
182
|
+
assertEquals("/category/electronics?from=20", record[7]);
|
183
|
+
assertEquals("Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1", record[8]);
|
184
|
+
}
|
185
|
+
}
|
186
|
+
|
187
|
+
@Test(expected = DataException.class)
|
188
|
+
public void checkInvalidRecord()
|
189
|
+
throws Exception
|
190
|
+
{
|
191
|
+
SchemaConfig schema = schema(
|
192
|
+
column("host", STRING), column("user", STRING),
|
193
|
+
column("time", TIMESTAMP, config().set("format", "%d/%b/%Y:%H:%M:%S %z")),
|
194
|
+
column("method", STRING), column("path", STRING),
|
195
|
+
column("code", STRING), column("size", STRING), column("referer", STRING),
|
196
|
+
column("agent", STRING));
|
197
|
+
ConfigSource config = this.config.deepCopy().set("columns", schema)
|
198
|
+
.set("format", "^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \\[(?<time>[^\\]]*)\\] \"(?<method>\\S+)(?: +(?<path>[^ ]*) +\\S*)?\" (?<code>[^ ]*) (?<size>[^ ]*)(?: \"(?<referer>[^\\\"]*)\" \"(?<agent>[^\\\"]*)\")?$")
|
199
|
+
.set("stop_on_invalid_record", true);
|
200
|
+
|
201
|
+
transaction(config, fileInput(
|
202
|
+
"224.126.227.109 - - [13/Feb/2017:20:04:52 +0900] \"GET /category/games HTTP/1.1\" 200 85 \"-\" \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11\"",
|
203
|
+
"invalid_record1",
|
204
|
+
"128.27.132.24 - bob [13/Feb/2017:20:04:53 +0900] \"GET /category/health HTTP/1.1\" 200 103 \"/category/electronics?from=20\" \"Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1\"",
|
205
|
+
"invalid_record2"
|
206
|
+
));
|
207
|
+
}
|
208
|
+
|
209
|
+
@Test
|
210
|
+
public void checkLookahead()
|
211
|
+
throws Exception
|
212
|
+
{
|
213
|
+
|
214
|
+
SchemaConfig schema = schema(
|
215
|
+
column("string1", STRING), column("string2", STRING));
|
216
|
+
|
217
|
+
ConfigSource config = this.config.deepCopy().set("columns", schema)
|
218
|
+
.set("format", "^\"(?<string1>.*?)\"(?<!\\.) \"(?<string2>.*?)\"");
|
219
|
+
|
220
|
+
transaction(config, fileInput(
|
221
|
+
"\"This is a \\\"test\\\".\" \"This is a \\\"test\\\".\"]}"));
|
222
|
+
|
223
|
+
List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
|
224
|
+
assertEquals(1, records.size());
|
225
|
+
|
226
|
+
Object[] record;
|
227
|
+
{
|
228
|
+
record = records.get(0);
|
229
|
+
assertEquals("This is a \\\"test\\\".", record[0]);
|
230
|
+
assertEquals("This is a \\", record[1]);
|
231
|
+
}
|
232
|
+
}
|
233
|
+
|
234
|
+
@Test(expected = DataException.class)
|
235
|
+
public void checkInvalidFormat()
|
236
|
+
throws Exception
|
237
|
+
{
|
238
|
+
|
239
|
+
SchemaConfig schema = schema(
|
240
|
+
column("date", TIMESTAMP, config().set("format", "%H:%M:%S")));
|
241
|
+
|
242
|
+
ConfigSource config = this.config.deepCopy().set("columns", schema)
|
243
|
+
.set("format", "(?<date>\\S+)");
|
244
|
+
|
245
|
+
transaction(config, fileInput(
|
246
|
+
"2017-02-19 This is a test."));
|
247
|
+
}
|
248
|
+
|
249
|
+
@Test
|
250
|
+
public void checkAllColumnTypes()
|
251
|
+
throws Exception
|
252
|
+
{
|
253
|
+
|
254
|
+
SchemaConfig schema = schema(
|
255
|
+
column("bool", BOOLEAN), column("string", STRING),
|
256
|
+
column("time", TIMESTAMP, config().set("format", "%Y-%m-%d %H:%M:%S")),
|
257
|
+
column("long", LONG), column("double", DOUBLE),
|
258
|
+
column("json", JSON));
|
259
|
+
|
260
|
+
ConfigSource config = this.config.deepCopy().set("columns", schema)
|
261
|
+
.set("format", "^(?<bool>[^\t]*)\t(?<string>[^\t]*)\t(?<time>[^\t]*)\t*(?<long>[^\t]*)\t(?<double>[^\t]*)\t(?<json>[^\t]*)$");
|
262
|
+
|
263
|
+
transaction(config, fileInput(
|
264
|
+
"true\tマイケル・ジャクソン\t2009-6-25 00:00:00\t456789\t123.456\t{\"name\":\"Michael Jackson\",\"birth\":\"1958-8-29\",\"age\":50,\"Bad World Tour\":4.4,\"album\":[\"Got To Be There\",\"Ben\",\"Music & Me\"]}"));
|
265
|
+
|
266
|
+
Value json = newMap(newString("name"), newString("Michael Jackson"),
|
267
|
+
newString("birth"), newString("1958-8-29"),
|
268
|
+
newString("age"), newInteger(50),
|
269
|
+
newString("Bad World Tour"), newFloat(4.4),
|
270
|
+
newString("album"), newArray(newString("Got To Be There"), newString("Ben"), newString("Music & Me")));
|
271
|
+
List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
|
272
|
+
assertEquals(1, records.size());
|
273
|
+
|
274
|
+
Object[] record;
|
275
|
+
{
|
276
|
+
record = records.get(0);
|
277
|
+
assertEquals(true, record[0]);
|
278
|
+
assertEquals("マイケル・ジャクソン", record[1]);
|
279
|
+
assertEquals(Timestamp.ofEpochSecond(1245888000L), record[2]);
|
280
|
+
assertEquals(456789L, record[3]);
|
281
|
+
assertEquals(123.456, record[4]);
|
282
|
+
assertEquals(json, record[5]);
|
283
|
+
}
|
284
|
+
}
|
285
|
+
|
286
|
+
@Test
|
287
|
+
public void checkDefaultValues()
|
288
|
+
{
|
289
|
+
ConfigSource config = Exec.newConfigSource()
|
290
|
+
.set("columns", ImmutableList.of(
|
291
|
+
ImmutableMap.of(
|
292
|
+
"name", "name",
|
293
|
+
"type", "string")))
|
294
|
+
.set("format", "(?<name>a*)");
|
295
|
+
|
296
|
+
JoniRegexpParserPlugin.PluginTask task = config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
297
|
+
|
298
|
+
assertEquals(Charset.forName("utf-8"), task.getCharset());
|
299
|
+
assertEquals(Newline.CRLF, task.getNewline());
|
300
|
+
assertEquals(DateTimeZone.UTC, task.getDefaultTimeZone());
|
301
|
+
assertEquals("%Y-%m-%d %H:%M:%S.%N %z", task.getDefaultTimestampFormat());
|
302
|
+
assertEquals(false, task.getStopOnInvalidRecord());
|
303
|
+
//assertEquals( true, task.getDefaultTypecast());
|
304
|
+
|
305
|
+
}
|
306
|
+
|
307
|
+
@Test(expected = ConfigException.class)
|
308
|
+
public void checkColumnsRequired()
|
309
|
+
{
|
310
|
+
ConfigSource config = Exec.newConfigSource()
|
311
|
+
.set("format", "(?<name>a*)");
|
312
|
+
|
313
|
+
config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
314
|
+
}
|
315
|
+
|
316
|
+
@Test(expected = ConfigException.class)
|
317
|
+
public void checkFormatRequired()
|
318
|
+
{
|
319
|
+
ConfigSource config = Exec.newConfigSource()
|
320
|
+
.set("columns", ImmutableList.of(
|
321
|
+
ImmutableMap.of(
|
322
|
+
"name", "name",
|
323
|
+
"type", "string")));
|
324
|
+
|
325
|
+
config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
326
|
+
}
|
327
|
+
|
328
|
+
@Test // (expected = SchemaConfigException.class)
|
329
|
+
public void checkNamedCaptureColumnNotFound()
|
330
|
+
{
|
331
|
+
ConfigSource config2 = Exec.newConfigSource()
|
332
|
+
.set("columns", ImmutableList.of(
|
333
|
+
ImmutableMap.of(
|
334
|
+
"name", "hoge",
|
335
|
+
"type", "string")))
|
336
|
+
.set("format", "(?<no_capture_name>a*)");
|
337
|
+
|
338
|
+
config2.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
339
|
+
try {
|
340
|
+
transaction(config2, fileInput(""));
|
341
|
+
}
|
342
|
+
catch (Throwable t) {
|
343
|
+
assertTrue(t instanceof SchemaConfigException);
|
344
|
+
}
|
345
|
+
}
|
346
|
+
|
347
|
+
@Test(expected = org.embulk.config.ConfigException.class)
|
348
|
+
public void checkNoNamedCapturingGroupRegex()
|
349
|
+
throws Exception
|
350
|
+
{
|
351
|
+
ConfigSource config2 = Exec.newConfigSource()
|
352
|
+
.set("columns", ImmutableList.of(
|
353
|
+
ImmutableMap.of(
|
354
|
+
"name", "hoge",
|
355
|
+
"type", "string")))
|
356
|
+
.set("format", "no named capturing group regex");
|
357
|
+
|
358
|
+
config2.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
359
|
+
transaction(config2, fileInput(""));
|
360
|
+
}
|
361
|
+
|
362
|
+
@Test(expected = org.joni.exception.SyntaxException.class)
|
363
|
+
public void checkInvalidRegexSyntax()
|
364
|
+
throws Exception
|
365
|
+
{
|
366
|
+
ConfigSource config2 = Exec.newConfigSource()
|
367
|
+
.set("columns", ImmutableList.of(
|
368
|
+
ImmutableMap.of(
|
369
|
+
"name", "hoge",
|
370
|
+
"type", "string")))
|
371
|
+
.set("format", "(?<invalid_regex");
|
372
|
+
|
373
|
+
config2.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
374
|
+
transaction(config2, fileInput(""));
|
375
|
+
}
|
376
|
+
|
377
|
+
private void transaction(ConfigSource config, final FileInput input)
|
378
|
+
{
|
379
|
+
plugin.transaction(config, new ParserPlugin.Control()
|
380
|
+
{
|
381
|
+
@Override
|
382
|
+
public void run(TaskSource taskSource, Schema schema)
|
383
|
+
{
|
384
|
+
plugin.run(taskSource, schema, input, output);
|
385
|
+
}
|
386
|
+
});
|
387
|
+
}
|
388
|
+
|
389
|
+
private void recreatePageOutput()
|
390
|
+
{
|
391
|
+
output = new TestPageBuilderReader.MockPageOutput();
|
392
|
+
}
|
393
|
+
|
394
|
+
private FileInput fileInput(String... lines)
|
395
|
+
throws Exception
|
396
|
+
{
|
397
|
+
StringBuilder sb = new StringBuilder();
|
398
|
+
for (String line : lines) {
|
399
|
+
sb.append(line).append("\n");
|
400
|
+
}
|
401
|
+
|
402
|
+
ByteArrayInputStream in = new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8));
|
403
|
+
return new InputStreamFileInput(runtime.getBufferAllocator(), provider(in));
|
404
|
+
}
|
405
|
+
|
406
|
+
private InputStreamFileInput.IteratorProvider provider(InputStream... inputStreams)
|
407
|
+
throws IOException
|
408
|
+
{
|
409
|
+
return new InputStreamFileInput.IteratorProvider(
|
410
|
+
ImmutableList.copyOf(inputStreams));
|
411
|
+
}
|
412
|
+
|
413
|
+
private ConfigSource config()
|
414
|
+
{
|
415
|
+
return runtime.getExec().newConfigSource();
|
416
|
+
}
|
417
|
+
|
418
|
+
private SchemaConfig schema(ColumnConfig... columns)
|
419
|
+
{
|
420
|
+
return new SchemaConfig(Lists.newArrayList(columns));
|
421
|
+
}
|
422
|
+
|
423
|
+
private ColumnConfig column(String name, Type type)
|
424
|
+
{
|
425
|
+
return column(name, type, config());
|
426
|
+
}
|
427
|
+
|
428
|
+
private ColumnConfig column(String name, Type type, ConfigSource option)
|
429
|
+
{
|
430
|
+
return new ColumnConfig(name, type, option);
|
431
|
+
}
|
432
|
+
}
|