embulk-parser-joni_regexp 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.travis.yml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +107 -0
- data/build.gradle +101 -0
- data/config/checkstyle/checkstyle.xml +128 -0
- data/config/checkstyle/default.xml +108 -0
- data/example/apache.txt +5000 -0
- data/example/apache.yml +18 -0
- data/example/conf.yml +12 -0
- data/example/conf2.yml +12 -0
- data/example/conf3.yml +12 -0
- data/example/seed.yml +8 -0
- data/example/test.txt +8 -0
- data/example/test2.txt +3 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +6 -0
- data/gradlew +169 -0
- data/gradlew.bat +84 -0
- data/lib/embulk/guess/joni_regexp.rb +28 -0
- data/lib/embulk/parser/joni_regexp.rb +3 -0
- data/src/main/java/org/embulk/parser/joni_regexp/ColumnCaster.java +97 -0
- data/src/main/java/org/embulk/parser/joni_regexp/ColumnVisitorImpl.java +167 -0
- data/src/main/java/org/embulk/parser/joni_regexp/JoniRegexpParserPlugin.java +169 -0
- data/src/main/java/org/embulk/parser/joni_regexp/JsonRecordValidateException.java +22 -0
- data/src/main/java/org/embulk/parser/joni_regexp/cast/BooleanCast.java +39 -0
- data/src/main/java/org/embulk/parser/joni_regexp/cast/DoubleCast.java +41 -0
- data/src/main/java/org/embulk/parser/joni_regexp/cast/JsonCast.java +40 -0
- data/src/main/java/org/embulk/parser/joni_regexp/cast/LongCast.java +47 -0
- data/src/main/java/org/embulk/parser/joni_regexp/cast/StringCast.java +82 -0
- data/src/test/java/org/embulk/parser/joni_regexp/TestColumnCaster.java +256 -0
- data/src/test/java/org/embulk/parser/joni_regexp/TestJoniRegexpParserPlugin.java +432 -0
- data/src/test/java/org/embulk/parser/joni_regexp/cast/TestBooleanCast.java +56 -0
- data/src/test/java/org/embulk/parser/joni_regexp/cast/TestDoubleCast.java +49 -0
- data/src/test/java/org/embulk/parser/joni_regexp/cast/TestJsonCast.java +79 -0
- data/src/test/java/org/embulk/parser/joni_regexp/cast/TestLongCast.java +41 -0
- data/src/test/java/org/embulk/parser/joni_regexp/cast/TestStringCast.java +103 -0
- metadata +112 -0
@@ -0,0 +1,47 @@
|
|
1
|
+
package org.embulk.parser.joni_regexp.cast;
|
2
|
+
|
3
|
+
import org.embulk.spi.DataException;
|
4
|
+
import org.embulk.spi.time.Timestamp;
|
5
|
+
|
6
|
+
public class LongCast
|
7
|
+
{
|
8
|
+
private LongCast() {}
|
9
|
+
|
10
|
+
private static String buildErrorMessage(String as, long value)
|
11
|
+
{
|
12
|
+
return String.format("cannot cast long to %s: \"%s\"", as, value);
|
13
|
+
}
|
14
|
+
|
15
|
+
public static boolean asBoolean(long value) throws DataException
|
16
|
+
{
|
17
|
+
if (value == 1) {
|
18
|
+
return true;
|
19
|
+
}
|
20
|
+
else if (value == 0) {
|
21
|
+
return false;
|
22
|
+
}
|
23
|
+
else {
|
24
|
+
throw new DataException(buildErrorMessage("boolean", value));
|
25
|
+
}
|
26
|
+
}
|
27
|
+
|
28
|
+
public static long asLong(long value) throws DataException
|
29
|
+
{
|
30
|
+
return value;
|
31
|
+
}
|
32
|
+
|
33
|
+
public static double asDouble(long value) throws DataException
|
34
|
+
{
|
35
|
+
return (double) value;
|
36
|
+
}
|
37
|
+
|
38
|
+
public static String asString(long value) throws DataException
|
39
|
+
{
|
40
|
+
return String.valueOf(value);
|
41
|
+
}
|
42
|
+
|
43
|
+
public static Timestamp asTimestamp(long value) throws DataException
|
44
|
+
{
|
45
|
+
return Timestamp.ofEpochSecond(value);
|
46
|
+
}
|
47
|
+
}
|
@@ -0,0 +1,82 @@
|
|
1
|
+
package org.embulk.parser.joni_regexp.cast;
|
2
|
+
|
3
|
+
import com.google.common.collect.ImmutableSet;
|
4
|
+
import org.embulk.spi.DataException;
|
5
|
+
import org.embulk.spi.time.Timestamp;
|
6
|
+
import org.embulk.spi.time.TimestampParseException;
|
7
|
+
import org.embulk.spi.time.TimestampParser;
|
8
|
+
|
9
|
+
public class StringCast
|
10
|
+
{
|
11
|
+
// copy from csv plugin
|
12
|
+
public static final ImmutableSet<String> TRUE_STRINGS =
|
13
|
+
ImmutableSet.of(
|
14
|
+
"true", "True", "TRUE",
|
15
|
+
"yes", "Yes", "YES",
|
16
|
+
"t", "T", "y", "Y",
|
17
|
+
"on", "On", "ON",
|
18
|
+
"1");
|
19
|
+
|
20
|
+
public static final ImmutableSet<String> FALSE_STRINGS =
|
21
|
+
ImmutableSet.of(
|
22
|
+
"false", "False", "FALSE",
|
23
|
+
"no", "No", "NO",
|
24
|
+
"f", "F", "n", "N",
|
25
|
+
"off", "Off", "OFF",
|
26
|
+
"0");
|
27
|
+
|
28
|
+
private StringCast() {}
|
29
|
+
|
30
|
+
private static String buildErrorMessage(String as, String value)
|
31
|
+
{
|
32
|
+
return String.format("cannot cast String to %s: \"%s\"", as, value);
|
33
|
+
}
|
34
|
+
|
35
|
+
public static boolean asBoolean(String value) throws DataException
|
36
|
+
{
|
37
|
+
if (TRUE_STRINGS.contains(value)) {
|
38
|
+
return true;
|
39
|
+
}
|
40
|
+
else if (FALSE_STRINGS.contains(value)) {
|
41
|
+
return false;
|
42
|
+
}
|
43
|
+
else {
|
44
|
+
throw new DataException(buildErrorMessage("boolean", value));
|
45
|
+
}
|
46
|
+
}
|
47
|
+
|
48
|
+
public static long asLong(String value) throws DataException
|
49
|
+
{
|
50
|
+
try {
|
51
|
+
return Long.parseLong(value);
|
52
|
+
}
|
53
|
+
catch (NumberFormatException ex) {
|
54
|
+
throw new DataException(buildErrorMessage("long", value), ex);
|
55
|
+
}
|
56
|
+
}
|
57
|
+
|
58
|
+
public static double asDouble(String value) throws DataException
|
59
|
+
{
|
60
|
+
try {
|
61
|
+
return Double.parseDouble(value);
|
62
|
+
}
|
63
|
+
catch (NumberFormatException ex) {
|
64
|
+
throw new DataException(buildErrorMessage("double", value), ex);
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
public static String asString(String value) throws DataException
|
69
|
+
{
|
70
|
+
return value;
|
71
|
+
}
|
72
|
+
|
73
|
+
public static Timestamp asTimestamp(String value, TimestampParser parser) throws DataException
|
74
|
+
{
|
75
|
+
try {
|
76
|
+
return parser.parse(value);
|
77
|
+
}
|
78
|
+
catch (TimestampParseException ex) {
|
79
|
+
throw new DataException(buildErrorMessage("timestamp", value), ex);
|
80
|
+
}
|
81
|
+
}
|
82
|
+
}
|
@@ -0,0 +1,256 @@
|
|
1
|
+
package org.embulk.parser.joni_regexp;
|
2
|
+
|
3
|
+
import org.embulk.EmbulkTestRuntime;
|
4
|
+
import org.embulk.spi.DataException;
|
5
|
+
import org.embulk.spi.time.Timestamp;
|
6
|
+
import org.embulk.spi.time.TimestampParser;
|
7
|
+
import org.joda.time.DateTimeZone;
|
8
|
+
import org.jruby.embed.ScriptingContainer;
|
9
|
+
import org.junit.Before;
|
10
|
+
import org.junit.Rule;
|
11
|
+
import org.junit.Test;
|
12
|
+
import org.msgpack.value.MapValue;
|
13
|
+
import org.msgpack.value.Value;
|
14
|
+
import org.msgpack.value.ValueFactory;
|
15
|
+
|
16
|
+
import static org.junit.Assert.assertEquals;
|
17
|
+
import static org.junit.Assert.assertTrue;
|
18
|
+
import static org.junit.Assert.fail;
|
19
|
+
|
20
|
+
public class TestColumnCaster
|
21
|
+
{
|
22
|
+
@Rule
|
23
|
+
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
24
|
+
public MapValue mapValue;
|
25
|
+
public DataException thrown;
|
26
|
+
public ScriptingContainer jruby;
|
27
|
+
public TimestampParser parser;
|
28
|
+
|
29
|
+
@Before
|
30
|
+
public void createResource()
|
31
|
+
{
|
32
|
+
jruby = new ScriptingContainer();
|
33
|
+
thrown = new DataException("any");
|
34
|
+
Value[] kvs = new Value[2];
|
35
|
+
kvs[0] = ValueFactory.newString("k");
|
36
|
+
kvs[1] = ValueFactory.newString("v");
|
37
|
+
mapValue = ValueFactory.newMap(kvs);
|
38
|
+
parser = new TimestampParser(jruby, "%Y-%m-%d %H:%M:%S.%N", DateTimeZone.UTC);
|
39
|
+
}
|
40
|
+
|
41
|
+
@Test
|
42
|
+
public void asBooleanFromBoolean()
|
43
|
+
{
|
44
|
+
assertEquals(true, ColumnCaster.asBoolean(ValueFactory.newBoolean(true)));
|
45
|
+
}
|
46
|
+
|
47
|
+
@Test
|
48
|
+
public void asBooleanFromInteger()
|
49
|
+
{
|
50
|
+
assertEquals(true, ColumnCaster.asBoolean(ValueFactory.newInteger(1)));
|
51
|
+
try {
|
52
|
+
ColumnCaster.asBoolean(ValueFactory.newInteger(2));
|
53
|
+
fail();
|
54
|
+
}
|
55
|
+
catch (Throwable t) {
|
56
|
+
assertTrue(t instanceof DataException);
|
57
|
+
}
|
58
|
+
}
|
59
|
+
|
60
|
+
@Test
|
61
|
+
public void asBooleanFromFloat()
|
62
|
+
{
|
63
|
+
try {
|
64
|
+
ColumnCaster.asBoolean(ValueFactory.newFloat(1.1));
|
65
|
+
fail();
|
66
|
+
}
|
67
|
+
catch (Throwable t) {
|
68
|
+
assertTrue(t instanceof DataException);
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
@Test
|
73
|
+
public void asBooleanFromString()
|
74
|
+
{
|
75
|
+
assertEquals(true, ColumnCaster.asBoolean(ValueFactory.newString("true")));
|
76
|
+
try {
|
77
|
+
ColumnCaster.asBoolean(ValueFactory.newString("foo"));
|
78
|
+
fail();
|
79
|
+
}
|
80
|
+
catch (Throwable t) {
|
81
|
+
assertTrue(t instanceof DataException);
|
82
|
+
}
|
83
|
+
}
|
84
|
+
|
85
|
+
@Test
|
86
|
+
public void asBooleanFromJson()
|
87
|
+
{
|
88
|
+
try {
|
89
|
+
ColumnCaster.asBoolean(mapValue);
|
90
|
+
fail();
|
91
|
+
}
|
92
|
+
catch (Throwable t) {
|
93
|
+
assertTrue(t instanceof DataException);
|
94
|
+
}
|
95
|
+
}
|
96
|
+
|
97
|
+
@Test
|
98
|
+
public void asLongFromBoolean()
|
99
|
+
{
|
100
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newBoolean(true)));
|
101
|
+
}
|
102
|
+
|
103
|
+
@Test
|
104
|
+
public void asLongFromInteger()
|
105
|
+
{
|
106
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newInteger(1)));
|
107
|
+
}
|
108
|
+
|
109
|
+
@Test
|
110
|
+
public void asLongFromFloat()
|
111
|
+
{
|
112
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newFloat(1.5)));
|
113
|
+
}
|
114
|
+
|
115
|
+
@Test
|
116
|
+
public void asLongFromString()
|
117
|
+
{
|
118
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newString("1")));
|
119
|
+
try {
|
120
|
+
ColumnCaster.asLong(ValueFactory.newString("foo"));
|
121
|
+
fail();
|
122
|
+
}
|
123
|
+
catch (Throwable t) {
|
124
|
+
assertTrue(t instanceof DataException);
|
125
|
+
}
|
126
|
+
}
|
127
|
+
|
128
|
+
@Test
|
129
|
+
public void asLongFromJson()
|
130
|
+
{
|
131
|
+
try {
|
132
|
+
ColumnCaster.asLong(mapValue);
|
133
|
+
fail();
|
134
|
+
}
|
135
|
+
catch (Throwable t) {
|
136
|
+
assertTrue(t instanceof DataException);
|
137
|
+
}
|
138
|
+
}
|
139
|
+
|
140
|
+
@Test
|
141
|
+
public void asDoubleFromBoolean()
|
142
|
+
{
|
143
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newBoolean(true)));
|
144
|
+
}
|
145
|
+
|
146
|
+
@Test
|
147
|
+
public void asDoubleFromInteger()
|
148
|
+
{
|
149
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newInteger(1)));
|
150
|
+
}
|
151
|
+
|
152
|
+
@Test
|
153
|
+
public void asDoubleFromFloat()
|
154
|
+
{
|
155
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newFloat(1.5)));
|
156
|
+
}
|
157
|
+
|
158
|
+
@Test
|
159
|
+
public void asDoubleFromString()
|
160
|
+
{
|
161
|
+
assertEquals(1, ColumnCaster.asLong(ValueFactory.newString("1")));
|
162
|
+
try {
|
163
|
+
ColumnCaster.asLong(ValueFactory.newString("foo"));
|
164
|
+
fail();
|
165
|
+
}
|
166
|
+
catch (Throwable t) {
|
167
|
+
assertTrue(t instanceof DataException);
|
168
|
+
}
|
169
|
+
}
|
170
|
+
|
171
|
+
@Test
|
172
|
+
public void asDoubleFromJson()
|
173
|
+
{
|
174
|
+
try {
|
175
|
+
ColumnCaster.asLong(mapValue);
|
176
|
+
fail();
|
177
|
+
}
|
178
|
+
catch (Throwable t) {
|
179
|
+
assertTrue(t instanceof DataException);
|
180
|
+
}
|
181
|
+
}
|
182
|
+
|
183
|
+
@Test
|
184
|
+
public void asStringFromBoolean()
|
185
|
+
{
|
186
|
+
assertEquals("true", ColumnCaster.asString(ValueFactory.newBoolean(true)));
|
187
|
+
}
|
188
|
+
|
189
|
+
@Test
|
190
|
+
public void asStringFromInteger()
|
191
|
+
{
|
192
|
+
assertEquals("1", ColumnCaster.asString(ValueFactory.newInteger(1)));
|
193
|
+
}
|
194
|
+
|
195
|
+
@Test
|
196
|
+
public void asStringFromFloat()
|
197
|
+
{
|
198
|
+
assertEquals("1.5", ColumnCaster.asString(ValueFactory.newFloat(1.5)));
|
199
|
+
}
|
200
|
+
|
201
|
+
@Test
|
202
|
+
public void asStringFromString()
|
203
|
+
{
|
204
|
+
assertEquals("1", ColumnCaster.asString(ValueFactory.newString("1")));
|
205
|
+
}
|
206
|
+
|
207
|
+
@Test
|
208
|
+
public void asStringFromJson()
|
209
|
+
{
|
210
|
+
assertEquals("{\"k\":\"v\"}", ColumnCaster.asString(mapValue));
|
211
|
+
}
|
212
|
+
|
213
|
+
@Test
|
214
|
+
public void asTimestampFromBoolean()
|
215
|
+
{
|
216
|
+
try {
|
217
|
+
ColumnCaster.asTimestamp(ValueFactory.newBoolean(true), parser);
|
218
|
+
fail();
|
219
|
+
}
|
220
|
+
catch (Throwable t) {
|
221
|
+
assertTrue(t instanceof DataException);
|
222
|
+
}
|
223
|
+
}
|
224
|
+
|
225
|
+
@Test
|
226
|
+
public void asTimestampFromInteger()
|
227
|
+
{
|
228
|
+
assertEquals(1, ColumnCaster.asTimestamp(ValueFactory.newInteger(1), parser).getEpochSecond());
|
229
|
+
}
|
230
|
+
|
231
|
+
@Test
|
232
|
+
public void asTimestampFromFloat()
|
233
|
+
{
|
234
|
+
Timestamp expected = Timestamp.ofEpochSecond(1463084053, 500000000);
|
235
|
+
assertEquals(expected, ColumnCaster.asTimestamp(ValueFactory.newFloat(1463084053.5), parser));
|
236
|
+
}
|
237
|
+
|
238
|
+
@Test
|
239
|
+
public void asTimestampFromString()
|
240
|
+
{
|
241
|
+
Timestamp expected = Timestamp.ofEpochSecond(1463084053, 500000000);
|
242
|
+
assertEquals(expected, ColumnCaster.asTimestamp(ValueFactory.newString("2016-05-12 20:14:13.5"), parser));
|
243
|
+
}
|
244
|
+
|
245
|
+
@Test
|
246
|
+
public void asTimestampFromJson()
|
247
|
+
{
|
248
|
+
try {
|
249
|
+
ColumnCaster.asTimestamp(mapValue, parser);
|
250
|
+
fail();
|
251
|
+
}
|
252
|
+
catch (Throwable t) {
|
253
|
+
assertTrue(t instanceof DataException);
|
254
|
+
}
|
255
|
+
}
|
256
|
+
}
|
@@ -0,0 +1,432 @@
|
|
1
|
+
package org.embulk.parser.joni_regexp;
|
2
|
+
|
3
|
+
import com.google.common.collect.ImmutableList;
|
4
|
+
import com.google.common.collect.ImmutableMap;
|
5
|
+
import com.google.common.collect.Lists;
|
6
|
+
import org.embulk.EmbulkTestRuntime;
|
7
|
+
import org.embulk.config.ConfigException;
|
8
|
+
import org.embulk.config.ConfigSource;
|
9
|
+
import org.embulk.config.TaskSource;
|
10
|
+
import org.embulk.spi.ColumnConfig;
|
11
|
+
import org.embulk.spi.DataException;
|
12
|
+
import org.embulk.spi.Exec;
|
13
|
+
import org.embulk.spi.FileInput;
|
14
|
+
import org.embulk.spi.ParserPlugin;
|
15
|
+
import org.embulk.spi.Schema;
|
16
|
+
import org.embulk.spi.SchemaConfig;
|
17
|
+
import org.embulk.spi.SchemaConfigException;
|
18
|
+
import org.embulk.spi.TestPageBuilderReader;
|
19
|
+
import org.embulk.spi.time.Timestamp;
|
20
|
+
import org.embulk.spi.type.Type;
|
21
|
+
import org.embulk.spi.util.InputStreamFileInput;
|
22
|
+
import org.embulk.spi.util.Newline;
|
23
|
+
import org.embulk.spi.util.Pages;
|
24
|
+
import org.joda.time.DateTimeZone;
|
25
|
+
import org.junit.Before;
|
26
|
+
import org.junit.Rule;
|
27
|
+
import org.junit.Test;
|
28
|
+
import org.msgpack.value.Value;
|
29
|
+
|
30
|
+
import java.io.ByteArrayInputStream;
|
31
|
+
import java.io.IOException;
|
32
|
+
import java.io.InputStream;
|
33
|
+
import java.nio.charset.Charset;
|
34
|
+
import java.nio.charset.StandardCharsets;
|
35
|
+
import java.util.List;
|
36
|
+
|
37
|
+
import static junit.framework.TestCase.assertEquals;
|
38
|
+
import static org.embulk.spi.type.Types.BOOLEAN;
|
39
|
+
import static org.embulk.spi.type.Types.DOUBLE;
|
40
|
+
import static org.embulk.spi.type.Types.JSON;
|
41
|
+
import static org.embulk.spi.type.Types.LONG;
|
42
|
+
import static org.embulk.spi.type.Types.STRING;
|
43
|
+
import static org.embulk.spi.type.Types.TIMESTAMP;
|
44
|
+
import static org.junit.Assert.assertTrue;
|
45
|
+
import static org.msgpack.value.ValueFactory.newArray;
|
46
|
+
import static org.msgpack.value.ValueFactory.newFloat;
|
47
|
+
import static org.msgpack.value.ValueFactory.newInteger;
|
48
|
+
import static org.msgpack.value.ValueFactory.newMap;
|
49
|
+
import static org.msgpack.value.ValueFactory.newString;
|
50
|
+
|
51
|
+
public class TestJoniRegexpParserPlugin
|
52
|
+
{
|
53
|
+
private ConfigSource config;
|
54
|
+
private JoniRegexpParserPlugin plugin;
|
55
|
+
private TestPageBuilderReader.MockPageOutput output;
|
56
|
+
|
57
|
+
// TODO
|
58
|
+
// private static final String RESOURCE_NAME_PREFIX = "org/embulk/parser/joni/";
|
59
|
+
|
60
|
+
@Before
|
61
|
+
public void createResource()
|
62
|
+
{
|
63
|
+
config = config().set("type", "joni_regexp");
|
64
|
+
plugin = new JoniRegexpParserPlugin();
|
65
|
+
recreatePageOutput();
|
66
|
+
}
|
67
|
+
|
68
|
+
@Rule
|
69
|
+
public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
|
70
|
+
|
71
|
+
// TODO use TestingEmbulk
|
72
|
+
// @Rule
|
73
|
+
// public TestingEmbulk embulk = TestingEmbulk.builder().build();
|
74
|
+
|
75
|
+
@Test
|
76
|
+
public void basicApacheCombinedLogTest()
|
77
|
+
throws Exception
|
78
|
+
{
|
79
|
+
// TODO Use TestingEmbulk
|
80
|
+
// ConfigSource config = embulk.loadYamlResource(RESOURCE_NAME_PREFIX+"apache.yml");
|
81
|
+
|
82
|
+
SchemaConfig schema = schema(
|
83
|
+
column("host", STRING), column("user", STRING),
|
84
|
+
column("time", TIMESTAMP, config().set("format", "%d/%b/%Y:%H:%M:%S %z")),
|
85
|
+
column("method", STRING), column("path", STRING),
|
86
|
+
column("code", STRING), column("size", STRING), column("referer", STRING),
|
87
|
+
column("agent", STRING));
|
88
|
+
ConfigSource config = this.config.deepCopy().set("columns", schema)
|
89
|
+
.set("format", "^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \\[(?<time>[^\\]]*)\\] \"(?<method>\\S+)(?: +(?<path>[^ ]*) +\\S*)?\" (?<code>[^ ]*) (?<size>[^ ]*)(?: \"(?<referer>[^\\\"]*)\" \"(?<agent>[^\\\"]*)\")?$");
|
90
|
+
|
91
|
+
// config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
92
|
+
|
93
|
+
transaction(config, fileInput(
|
94
|
+
"224.126.227.109 - - [13/Feb/2017:20:04:52 +0900] \"GET /category/games HTTP/1.1\" 200 85 \"-\" \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11\"",
|
95
|
+
"128.27.132.24 - bob [13/Feb/2017:20:04:53 +0900] \"GET /category/health HTTP/1.1\" 200 103 \"/category/electronics?from=20\" \"Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1\""
|
96
|
+
));
|
97
|
+
|
98
|
+
List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
|
99
|
+
assertEquals(2, records.size());
|
100
|
+
|
101
|
+
Object[] record;
|
102
|
+
{
|
103
|
+
record = records.get(0);
|
104
|
+
assertEquals("224.126.227.109", record[0]);
|
105
|
+
assertEquals("-", record[1]);
|
106
|
+
assertEquals(Timestamp.ofEpochSecond(1486983892L), record[2]);
|
107
|
+
assertEquals("GET", record[3]);
|
108
|
+
assertEquals("/category/games", record[4]);
|
109
|
+
// assertEquals("HTTP/1.1", record[5]);
|
110
|
+
assertEquals("200", record[5]);
|
111
|
+
assertEquals("85", record[6]);
|
112
|
+
assertEquals("-", record[7]);
|
113
|
+
assertEquals("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", record[8]);
|
114
|
+
}
|
115
|
+
{
|
116
|
+
record = records.get(1);
|
117
|
+
assertEquals("128.27.132.24", record[0]);
|
118
|
+
assertEquals("bob", record[1]);
|
119
|
+
assertEquals(Timestamp.ofEpochSecond(1486983893L), record[2]);
|
120
|
+
assertEquals("GET", record[3]);
|
121
|
+
assertEquals("/category/health", record[4]);
|
122
|
+
// assertEquals("HTTP/1.1", record[5]);
|
123
|
+
assertEquals("200", record[5]);
|
124
|
+
assertEquals("103", record[6]);
|
125
|
+
assertEquals("/category/electronics?from=20", record[7]);
|
126
|
+
assertEquals("Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1", record[8]);
|
127
|
+
}
|
128
|
+
}
|
129
|
+
|
130
|
+
@Test
|
131
|
+
public void basicApacheCombinedLogTestEnableStopOnInvalidRecord()
|
132
|
+
throws Exception
|
133
|
+
{
|
134
|
+
// TODO Use TestingEmbulk
|
135
|
+
// ConfigSource config = embulk.loadYamlResource(RESOURCE_NAME_PREFIX+"apache.yml");
|
136
|
+
|
137
|
+
SchemaConfig schema = schema(
|
138
|
+
column("host", STRING), column("user", STRING),
|
139
|
+
column("time", TIMESTAMP, config().set("format", "%d/%b/%Y:%H:%M:%S %z")),
|
140
|
+
column("method", STRING), column("path", STRING),
|
141
|
+
column("code", STRING), column("size", STRING), column("referer", STRING),
|
142
|
+
column("agent", STRING));
|
143
|
+
ConfigSource config = this.config.deepCopy().set("columns", schema)
|
144
|
+
.set("format", "^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \\[(?<time>[^\\]]*)\\] \"(?<method>\\S+)(?: +(?<path>[^ ]*) +\\S*)?\" (?<code>[^ ]*) (?<size>[^ ]*)(?: \"(?<referer>[^\\\"]*)\" \"(?<agent>[^\\\"]*)\")?$")
|
145
|
+
.set("stop_on_invalid_record", false);
|
146
|
+
|
147
|
+
// config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
148
|
+
|
149
|
+
transaction(config, fileInput(
|
150
|
+
"224.126.227.109 - - [13/Feb/2017:20:04:52 +0900] \"GET /category/games HTTP/1.1\" 200 85 \"-\" \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11\"",
|
151
|
+
"invalid_record1",
|
152
|
+
"128.27.132.24 - bob [13/Feb/2017:20:04:53 +0900] \"GET /category/health HTTP/1.1\" 200 103 \"/category/electronics?from=20\" \"Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1\"",
|
153
|
+
"invalid_record2"
|
154
|
+
));
|
155
|
+
|
156
|
+
List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
|
157
|
+
assertEquals(2, records.size());
|
158
|
+
|
159
|
+
Object[] record;
|
160
|
+
{
|
161
|
+
record = records.get(0);
|
162
|
+
assertEquals("224.126.227.109", record[0]);
|
163
|
+
assertEquals("-", record[1]);
|
164
|
+
assertEquals(Timestamp.ofEpochSecond(1486983892L), record[2]);
|
165
|
+
assertEquals("GET", record[3]);
|
166
|
+
assertEquals("/category/games", record[4]);
|
167
|
+
// assertEquals("HTTP/1.1", record[5]);
|
168
|
+
assertEquals("200", record[5]);
|
169
|
+
assertEquals("85", record[6]);
|
170
|
+
assertEquals("-", record[7]);
|
171
|
+
assertEquals("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", record[8]);
|
172
|
+
}
|
173
|
+
{
|
174
|
+
record = records.get(1);
|
175
|
+
assertEquals("128.27.132.24", record[0]);
|
176
|
+
assertEquals("bob", record[1]);
|
177
|
+
assertEquals(Timestamp.ofEpochSecond(1486983893L), record[2]);
|
178
|
+
assertEquals("GET", record[3]);
|
179
|
+
assertEquals("/category/health", record[4]);
|
180
|
+
assertEquals("200", record[5]);
|
181
|
+
assertEquals("103", record[6]);
|
182
|
+
assertEquals("/category/electronics?from=20", record[7]);
|
183
|
+
assertEquals("Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1", record[8]);
|
184
|
+
}
|
185
|
+
}
|
186
|
+
|
187
|
+
@Test(expected = DataException.class)
|
188
|
+
public void checkInvalidRecord()
|
189
|
+
throws Exception
|
190
|
+
{
|
191
|
+
SchemaConfig schema = schema(
|
192
|
+
column("host", STRING), column("user", STRING),
|
193
|
+
column("time", TIMESTAMP, config().set("format", "%d/%b/%Y:%H:%M:%S %z")),
|
194
|
+
column("method", STRING), column("path", STRING),
|
195
|
+
column("code", STRING), column("size", STRING), column("referer", STRING),
|
196
|
+
column("agent", STRING));
|
197
|
+
ConfigSource config = this.config.deepCopy().set("columns", schema)
|
198
|
+
.set("format", "^(?<host>[^ ]*) [^ ]* (?<user>[^ ]*) \\[(?<time>[^\\]]*)\\] \"(?<method>\\S+)(?: +(?<path>[^ ]*) +\\S*)?\" (?<code>[^ ]*) (?<size>[^ ]*)(?: \"(?<referer>[^\\\"]*)\" \"(?<agent>[^\\\"]*)\")?$")
|
199
|
+
.set("stop_on_invalid_record", true);
|
200
|
+
|
201
|
+
transaction(config, fileInput(
|
202
|
+
"224.126.227.109 - - [13/Feb/2017:20:04:52 +0900] \"GET /category/games HTTP/1.1\" 200 85 \"-\" \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11\"",
|
203
|
+
"invalid_record1",
|
204
|
+
"128.27.132.24 - bob [13/Feb/2017:20:04:53 +0900] \"GET /category/health HTTP/1.1\" 200 103 \"/category/electronics?from=20\" \"Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/20100101 Firefox/10.0.1\"",
|
205
|
+
"invalid_record2"
|
206
|
+
));
|
207
|
+
}
|
208
|
+
|
209
|
+
@Test
|
210
|
+
public void checkLookahead()
|
211
|
+
throws Exception
|
212
|
+
{
|
213
|
+
|
214
|
+
SchemaConfig schema = schema(
|
215
|
+
column("string1", STRING), column("string2", STRING));
|
216
|
+
|
217
|
+
ConfigSource config = this.config.deepCopy().set("columns", schema)
|
218
|
+
.set("format", "^\"(?<string1>.*?)\"(?<!\\.) \"(?<string2>.*?)\"");
|
219
|
+
|
220
|
+
transaction(config, fileInput(
|
221
|
+
"\"This is a \\\"test\\\".\" \"This is a \\\"test\\\".\"]}"));
|
222
|
+
|
223
|
+
List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
|
224
|
+
assertEquals(1, records.size());
|
225
|
+
|
226
|
+
Object[] record;
|
227
|
+
{
|
228
|
+
record = records.get(0);
|
229
|
+
assertEquals("This is a \\\"test\\\".", record[0]);
|
230
|
+
assertEquals("This is a \\", record[1]);
|
231
|
+
}
|
232
|
+
}
|
233
|
+
|
234
|
+
@Test(expected = DataException.class)
|
235
|
+
public void checkInvalidFormat()
|
236
|
+
throws Exception
|
237
|
+
{
|
238
|
+
|
239
|
+
SchemaConfig schema = schema(
|
240
|
+
column("date", TIMESTAMP, config().set("format", "%H:%M:%S")));
|
241
|
+
|
242
|
+
ConfigSource config = this.config.deepCopy().set("columns", schema)
|
243
|
+
.set("format", "(?<date>\\S+)");
|
244
|
+
|
245
|
+
transaction(config, fileInput(
|
246
|
+
"2017-02-19 This is a test."));
|
247
|
+
}
|
248
|
+
|
249
|
+
@Test
|
250
|
+
public void checkAllColumnTypes()
|
251
|
+
throws Exception
|
252
|
+
{
|
253
|
+
|
254
|
+
SchemaConfig schema = schema(
|
255
|
+
column("bool", BOOLEAN), column("string", STRING),
|
256
|
+
column("time", TIMESTAMP, config().set("format", "%Y-%m-%d %H:%M:%S")),
|
257
|
+
column("long", LONG), column("double", DOUBLE),
|
258
|
+
column("json", JSON));
|
259
|
+
|
260
|
+
ConfigSource config = this.config.deepCopy().set("columns", schema)
|
261
|
+
.set("format", "^(?<bool>[^\t]*)\t(?<string>[^\t]*)\t(?<time>[^\t]*)\t*(?<long>[^\t]*)\t(?<double>[^\t]*)\t(?<json>[^\t]*)$");
|
262
|
+
|
263
|
+
transaction(config, fileInput(
|
264
|
+
"true\tマイケル・ジャクソン\t2009-6-25 00:00:00\t456789\t123.456\t{\"name\":\"Michael Jackson\",\"birth\":\"1958-8-29\",\"age\":50,\"Bad World Tour\":4.4,\"album\":[\"Got To Be There\",\"Ben\",\"Music & Me\"]}"));
|
265
|
+
|
266
|
+
Value json = newMap(newString("name"), newString("Michael Jackson"),
|
267
|
+
newString("birth"), newString("1958-8-29"),
|
268
|
+
newString("age"), newInteger(50),
|
269
|
+
newString("Bad World Tour"), newFloat(4.4),
|
270
|
+
newString("album"), newArray(newString("Got To Be There"), newString("Ben"), newString("Music & Me")));
|
271
|
+
List<Object[]> records = Pages.toObjects(schema.toSchema(), output.pages);
|
272
|
+
assertEquals(1, records.size());
|
273
|
+
|
274
|
+
Object[] record;
|
275
|
+
{
|
276
|
+
record = records.get(0);
|
277
|
+
assertEquals(true, record[0]);
|
278
|
+
assertEquals("マイケル・ジャクソン", record[1]);
|
279
|
+
assertEquals(Timestamp.ofEpochSecond(1245888000L), record[2]);
|
280
|
+
assertEquals(456789L, record[3]);
|
281
|
+
assertEquals(123.456, record[4]);
|
282
|
+
assertEquals(json, record[5]);
|
283
|
+
}
|
284
|
+
}
|
285
|
+
|
286
|
+
@Test
|
287
|
+
public void checkDefaultValues()
|
288
|
+
{
|
289
|
+
ConfigSource config = Exec.newConfigSource()
|
290
|
+
.set("columns", ImmutableList.of(
|
291
|
+
ImmutableMap.of(
|
292
|
+
"name", "name",
|
293
|
+
"type", "string")))
|
294
|
+
.set("format", "(?<name>a*)");
|
295
|
+
|
296
|
+
JoniRegexpParserPlugin.PluginTask task = config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
297
|
+
|
298
|
+
assertEquals(Charset.forName("utf-8"), task.getCharset());
|
299
|
+
assertEquals(Newline.CRLF, task.getNewline());
|
300
|
+
assertEquals(DateTimeZone.UTC, task.getDefaultTimeZone());
|
301
|
+
assertEquals("%Y-%m-%d %H:%M:%S.%N %z", task.getDefaultTimestampFormat());
|
302
|
+
assertEquals(false, task.getStopOnInvalidRecord());
|
303
|
+
//assertEquals( true, task.getDefaultTypecast());
|
304
|
+
|
305
|
+
}
|
306
|
+
|
307
|
+
@Test(expected = ConfigException.class)
|
308
|
+
public void checkColumnsRequired()
|
309
|
+
{
|
310
|
+
ConfigSource config = Exec.newConfigSource()
|
311
|
+
.set("format", "(?<name>a*)");
|
312
|
+
|
313
|
+
config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
314
|
+
}
|
315
|
+
|
316
|
+
@Test(expected = ConfigException.class)
|
317
|
+
public void checkFormatRequired()
|
318
|
+
{
|
319
|
+
ConfigSource config = Exec.newConfigSource()
|
320
|
+
.set("columns", ImmutableList.of(
|
321
|
+
ImmutableMap.of(
|
322
|
+
"name", "name",
|
323
|
+
"type", "string")));
|
324
|
+
|
325
|
+
config.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
326
|
+
}
|
327
|
+
|
328
|
+
@Test // (expected = SchemaConfigException.class)
|
329
|
+
public void checkNamedCaptureColumnNotFound()
|
330
|
+
{
|
331
|
+
ConfigSource config2 = Exec.newConfigSource()
|
332
|
+
.set("columns", ImmutableList.of(
|
333
|
+
ImmutableMap.of(
|
334
|
+
"name", "hoge",
|
335
|
+
"type", "string")))
|
336
|
+
.set("format", "(?<no_capture_name>a*)");
|
337
|
+
|
338
|
+
config2.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
339
|
+
try {
|
340
|
+
transaction(config2, fileInput(""));
|
341
|
+
}
|
342
|
+
catch (Throwable t) {
|
343
|
+
assertTrue(t instanceof SchemaConfigException);
|
344
|
+
}
|
345
|
+
}
|
346
|
+
|
347
|
+
@Test(expected = org.embulk.config.ConfigException.class)
|
348
|
+
public void checkNoNamedCapturingGroupRegex()
|
349
|
+
throws Exception
|
350
|
+
{
|
351
|
+
ConfigSource config2 = Exec.newConfigSource()
|
352
|
+
.set("columns", ImmutableList.of(
|
353
|
+
ImmutableMap.of(
|
354
|
+
"name", "hoge",
|
355
|
+
"type", "string")))
|
356
|
+
.set("format", "no named capturing group regex");
|
357
|
+
|
358
|
+
config2.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
359
|
+
transaction(config2, fileInput(""));
|
360
|
+
}
|
361
|
+
|
362
|
+
@Test(expected = org.joni.exception.SyntaxException.class)
|
363
|
+
public void checkInvalidRegexSyntax()
|
364
|
+
throws Exception
|
365
|
+
{
|
366
|
+
ConfigSource config2 = Exec.newConfigSource()
|
367
|
+
.set("columns", ImmutableList.of(
|
368
|
+
ImmutableMap.of(
|
369
|
+
"name", "hoge",
|
370
|
+
"type", "string")))
|
371
|
+
.set("format", "(?<invalid_regex");
|
372
|
+
|
373
|
+
config2.loadConfig(JoniRegexpParserPlugin.PluginTask.class);
|
374
|
+
transaction(config2, fileInput(""));
|
375
|
+
}
|
376
|
+
|
377
|
+
private void transaction(ConfigSource config, final FileInput input)
|
378
|
+
{
|
379
|
+
plugin.transaction(config, new ParserPlugin.Control()
|
380
|
+
{
|
381
|
+
@Override
|
382
|
+
public void run(TaskSource taskSource, Schema schema)
|
383
|
+
{
|
384
|
+
plugin.run(taskSource, schema, input, output);
|
385
|
+
}
|
386
|
+
});
|
387
|
+
}
|
388
|
+
|
389
|
+
private void recreatePageOutput()
|
390
|
+
{
|
391
|
+
output = new TestPageBuilderReader.MockPageOutput();
|
392
|
+
}
|
393
|
+
|
394
|
+
private FileInput fileInput(String... lines)
|
395
|
+
throws Exception
|
396
|
+
{
|
397
|
+
StringBuilder sb = new StringBuilder();
|
398
|
+
for (String line : lines) {
|
399
|
+
sb.append(line).append("\n");
|
400
|
+
}
|
401
|
+
|
402
|
+
ByteArrayInputStream in = new ByteArrayInputStream(sb.toString().getBytes(StandardCharsets.UTF_8));
|
403
|
+
return new InputStreamFileInput(runtime.getBufferAllocator(), provider(in));
|
404
|
+
}
|
405
|
+
|
406
|
+
private InputStreamFileInput.IteratorProvider provider(InputStream... inputStreams)
|
407
|
+
throws IOException
|
408
|
+
{
|
409
|
+
return new InputStreamFileInput.IteratorProvider(
|
410
|
+
ImmutableList.copyOf(inputStreams));
|
411
|
+
}
|
412
|
+
|
413
|
+
private ConfigSource config()
|
414
|
+
{
|
415
|
+
return runtime.getExec().newConfigSource();
|
416
|
+
}
|
417
|
+
|
418
|
+
private SchemaConfig schema(ColumnConfig... columns)
|
419
|
+
{
|
420
|
+
return new SchemaConfig(Lists.newArrayList(columns));
|
421
|
+
}
|
422
|
+
|
423
|
+
private ColumnConfig column(String name, Type type)
|
424
|
+
{
|
425
|
+
return column(name, type, config());
|
426
|
+
}
|
427
|
+
|
428
|
+
private ColumnConfig column(String name, Type type, ConfigSource option)
|
429
|
+
{
|
430
|
+
return new ColumnConfig(name, type, option);
|
431
|
+
}
|
432
|
+
}
|