embulk-parser-joni_regexp 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.travis.yml +7 -0
  4. data/LICENSE.txt +21 -0
  5. data/README.md +107 -0
  6. data/build.gradle +101 -0
  7. data/config/checkstyle/checkstyle.xml +128 -0
  8. data/config/checkstyle/default.xml +108 -0
  9. data/example/apache.txt +5000 -0
  10. data/example/apache.yml +18 -0
  11. data/example/conf.yml +12 -0
  12. data/example/conf2.yml +12 -0
  13. data/example/conf3.yml +12 -0
  14. data/example/seed.yml +8 -0
  15. data/example/test.txt +8 -0
  16. data/example/test2.txt +3 -0
  17. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  18. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  19. data/gradlew +169 -0
  20. data/gradlew.bat +84 -0
  21. data/lib/embulk/guess/joni_regexp.rb +28 -0
  22. data/lib/embulk/parser/joni_regexp.rb +3 -0
  23. data/src/main/java/org/embulk/parser/joni_regexp/ColumnCaster.java +97 -0
  24. data/src/main/java/org/embulk/parser/joni_regexp/ColumnVisitorImpl.java +167 -0
  25. data/src/main/java/org/embulk/parser/joni_regexp/JoniRegexpParserPlugin.java +169 -0
  26. data/src/main/java/org/embulk/parser/joni_regexp/JsonRecordValidateException.java +22 -0
  27. data/src/main/java/org/embulk/parser/joni_regexp/cast/BooleanCast.java +39 -0
  28. data/src/main/java/org/embulk/parser/joni_regexp/cast/DoubleCast.java +41 -0
  29. data/src/main/java/org/embulk/parser/joni_regexp/cast/JsonCast.java +40 -0
  30. data/src/main/java/org/embulk/parser/joni_regexp/cast/LongCast.java +47 -0
  31. data/src/main/java/org/embulk/parser/joni_regexp/cast/StringCast.java +82 -0
  32. data/src/test/java/org/embulk/parser/joni_regexp/TestColumnCaster.java +256 -0
  33. data/src/test/java/org/embulk/parser/joni_regexp/TestJoniRegexpParserPlugin.java +432 -0
  34. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestBooleanCast.java +56 -0
  35. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestDoubleCast.java +49 -0
  36. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestJsonCast.java +79 -0
  37. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestLongCast.java +41 -0
  38. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestStringCast.java +103 -0
  39. metadata +112 -0
@@ -0,0 +1,167 @@
1
+ package org.embulk.parser.joni_regexp;
2
+
3
+ import com.google.common.base.Optional;
4
+ import org.embulk.parser.joni_regexp.JoniRegexpParserPlugin.PluginTask;
5
+ import org.embulk.parser.joni_regexp.JoniRegexpParserPlugin.TypecastColumnOption;
6
+
7
+ import org.embulk.spi.Column;
8
+ import org.embulk.spi.ColumnConfig;
9
+ import org.embulk.spi.ColumnVisitor;
10
+ import org.embulk.spi.PageBuilder;
11
+ import org.embulk.spi.Schema;
12
+ import org.embulk.spi.SchemaConfig;
13
+ import org.embulk.spi.json.JsonParser;
14
+ import org.embulk.spi.time.Timestamp;
15
+ import org.embulk.spi.time.TimestampParser;
16
+ import org.msgpack.core.MessageTypeException;
17
+ import org.msgpack.value.Value;
18
+
19
+ public class ColumnVisitorImpl implements ColumnVisitor
20
+ {
21
+ protected final PluginTask task;
22
+ protected final Schema schema;
23
+ protected final PageBuilder pageBuilder;
24
+ protected final TimestampParser[] timestampParsers;
25
+ protected final Boolean[] autoTypecasts;
26
+
27
+ protected Value value;
28
+
29
+ public ColumnVisitorImpl(PluginTask task, Schema schema, PageBuilder pageBuilder, TimestampParser[] timestampParsers)
30
+ {
31
+ this.task = task;
32
+ this.schema = schema;
33
+ this.pageBuilder = pageBuilder;
34
+ this.timestampParsers = timestampParsers.clone();
35
+ this.autoTypecasts = new Boolean[schema.size()];
36
+ buildAutoTypecasts();
37
+ }
38
+
39
+ private void buildAutoTypecasts()
40
+ {
41
+ for (Column column : schema.getColumns()) {
42
+ this.autoTypecasts[column.getIndex()] = task.getDefaultTypecast();
43
+ }
44
+
45
+ SchemaConfig schemaConfig = task.getColumns();
46
+
47
+ for (ColumnConfig columnConfig : schemaConfig.getColumns()) {
48
+ TypecastColumnOption columnOption = columnConfig.getOption().loadConfig(TypecastColumnOption.class);
49
+ Boolean autoTypecast = columnOption.getTypecast().or(task.getDefaultTypecast());
50
+ Column column = schema.lookupColumn(columnConfig.getName());
51
+ this.autoTypecasts[column.getIndex()] = autoTypecast;
52
+ }
53
+ }
54
+
55
+ public void setValue(Value value)
56
+ {
57
+ this.value = value;
58
+ }
59
+
60
+ @Override
61
+ public void booleanColumn(Column column)
62
+ {
63
+ if (isNil(value)) {
64
+ pageBuilder.setNull(column);
65
+ }
66
+ else {
67
+ try {
68
+ boolean booleanValue = autoTypecasts[column.getIndex()] ? ColumnCaster.asBoolean(value) : value.asBooleanValue().getBoolean();
69
+ pageBuilder.setBoolean(column, booleanValue);
70
+ }
71
+ catch (MessageTypeException e) {
72
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Boolean", value), e);
73
+ }
74
+ }
75
+ }
76
+
77
+ @Override
78
+ public void longColumn(Column column)
79
+ {
80
+ if (isNil(value)) {
81
+ pageBuilder.setNull(column);
82
+ }
83
+ else {
84
+ try {
85
+ long longValue = autoTypecasts[column.getIndex()] ? ColumnCaster.asLong(value) : value.asIntegerValue().toLong();
86
+ pageBuilder.setLong(column, longValue);
87
+ }
88
+ catch (MessageTypeException e) {
89
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Long", value), e);
90
+ }
91
+ }
92
+ }
93
+
94
+ @Override
95
+ public void doubleColumn(Column column)
96
+ {
97
+ if (isNil(value)) {
98
+ pageBuilder.setNull(column);
99
+ }
100
+ else {
101
+ try {
102
+ double doubleValue = autoTypecasts[column.getIndex()] ? ColumnCaster.asDouble(value) : value.asFloatValue().toDouble();
103
+ pageBuilder.setDouble(column, doubleValue);
104
+ }
105
+ catch (MessageTypeException e) {
106
+ throw new JsonRecordValidateException(String.format("failed get \"%s\" as Double", value), e);
107
+ }
108
+ }
109
+ }
110
+
111
+ @Override
112
+ public void stringColumn(Column column)
113
+ {
114
+ if (isNil(value)) {
115
+ pageBuilder.setNull(column);
116
+ }
117
+ else {
118
+ try {
119
+ String string = autoTypecasts[column.getIndex()] ? ColumnCaster.asString(value) : value.asStringValue().toString();
120
+ pageBuilder.setString(column, string);
121
+ }
122
+ catch (MessageTypeException e) {
123
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as String", value), e);
124
+ }
125
+ }
126
+ }
127
+
128
+ @Override
129
+ public void timestampColumn(Column column)
130
+ {
131
+ if (isNil(value)) {
132
+ pageBuilder.setNull(column);
133
+ }
134
+ else {
135
+ try {
136
+ Timestamp timestamp = ColumnCaster.asTimestamp(value, timestampParsers[column.getIndex()]);
137
+ pageBuilder.setTimestamp(column, timestamp);
138
+ }
139
+ catch (MessageTypeException e) {
140
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Timestamp", value), e);
141
+ }
142
+ }
143
+ }
144
+
145
+ @Override
146
+ public void jsonColumn(Column column)
147
+ {
148
+ if (isNil(value)) {
149
+ pageBuilder.setNull(column);
150
+ }
151
+ else {
152
+ try {
153
+ JsonParser parser = new JsonParser();
154
+ Value json = parser.parse(value.toString());
155
+ pageBuilder.setJson(column, json);
156
+ }
157
+ catch (MessageTypeException e) {
158
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Json", value), e);
159
+ }
160
+ }
161
+ }
162
+
163
+ protected boolean isNil(Value v)
164
+ {
165
+ return v == null || v.isNilValue();
166
+ }
167
+ }
@@ -0,0 +1,169 @@
1
+ package org.embulk.parser.joni_regexp;
2
+
3
+ import com.google.common.base.Optional;
4
+ import org.embulk.config.Config;
5
+ import org.embulk.config.ConfigDefault;
6
+ import org.embulk.config.ConfigException;
7
+ import org.embulk.config.ConfigSource;
8
+ import org.embulk.config.Task;
9
+ import org.embulk.config.TaskSource;
10
+ import org.embulk.spi.Column;
11
+ import org.embulk.spi.DataException;
12
+ import org.embulk.spi.Exec;
13
+ import org.embulk.spi.FileInput;
14
+ import org.embulk.spi.PageBuilder;
15
+ import org.embulk.spi.PageOutput;
16
+ import org.embulk.spi.ParserPlugin;
17
+ import org.embulk.spi.Schema;
18
+ import org.embulk.spi.SchemaConfig;
19
+ import org.embulk.spi.time.TimestampParser;
20
+ import org.embulk.spi.util.LineDecoder;
21
+ import org.embulk.spi.util.Timestamps;
22
+ import org.jcodings.specific.UTF8Encoding;
23
+ import org.joni.Matcher;
24
+ import org.joni.NameEntry;
25
+ import org.joni.Option;
26
+ import org.joni.Regex;
27
+ import org.joni.Region;
28
+ import org.msgpack.value.Value;
29
+ import org.msgpack.value.ValueFactory;
30
+ import org.slf4j.Logger;
31
+
32
+ import java.nio.charset.Charset;
33
+ import java.nio.charset.StandardCharsets;
34
+ import java.util.Iterator;
35
+ import java.util.Locale;
36
+
37
+ public class JoniRegexpParserPlugin
38
+ implements ParserPlugin
39
+ {
40
+ private static final Logger logger = Exec.getLogger(JoniRegexpParserPlugin.class);
41
+
42
+ public interface TypecastColumnOption
43
+ extends Task
44
+ {
45
+ @Config("typecast")
46
+ @ConfigDefault("null")
47
+ public Optional<Boolean> getTypecast();
48
+ }
49
+
50
+ public interface PluginTask
51
+ extends Task, LineDecoder.DecoderTask, TimestampParser.Task
52
+ {
53
+ @Config("columns")
54
+ SchemaConfig getColumns();
55
+
56
+ @Config("stop_on_invalid_record")
57
+ @ConfigDefault("false")
58
+ boolean getStopOnInvalidRecord();
59
+
60
+ @Config("format")
61
+ String getFormat();
62
+
63
+ @Config("default_typecast")
64
+ @ConfigDefault("true")
65
+ Boolean getDefaultTypecast();
66
+ }
67
+
68
+ @Override
69
+ public void transaction(ConfigSource config, ParserPlugin.Control control)
70
+ {
71
+ PluginTask task = config.loadConfig(PluginTask.class);
72
+
73
+ Schema schema = task.getColumns().toSchema();
74
+
75
+ validateSchema(task, schema);
76
+
77
+ control.run(task.dump(), schema);
78
+ }
79
+
80
+ @Override
81
+ public void run(TaskSource taskSource, Schema schema,
82
+ FileInput input, PageOutput output)
83
+ {
84
+ PluginTask task = taskSource.loadTask(PluginTask.class);
85
+ LineDecoder lineDecoder = new LineDecoder(input, task);
86
+ PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output);
87
+ TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getColumns());
88
+
89
+ ColumnVisitorImpl visitor = new ColumnVisitorImpl(task, schema, pageBuilder, timestampParsers);
90
+
91
+ Regex regex = buildRegex(task);
92
+
93
+ while (input.nextFile()) {
94
+ while (true) {
95
+ String line = lineDecoder.poll();
96
+ // logger.debug(String.format(Locale.ENGLISH,"line = %s",line));
97
+ if (line == null) {
98
+ break;
99
+ }
100
+ byte[] line_bytes = line.getBytes(StandardCharsets.UTF_8);
101
+ Matcher matcher = regex.matcher(line_bytes);
102
+ int result = matcher.search(0, line_bytes.length, Option.DEFAULT);
103
+
104
+ if (result != -1) {
105
+ Region region = matcher.getEagerRegion();
106
+ for (Iterator<NameEntry> entry = regex.namedBackrefIterator(); entry.hasNext(); ) {
107
+ NameEntry e = entry.next();
108
+ String name = captureName(e);
109
+
110
+ int number = e.getBackRefs()[0];
111
+ int begin = region.beg[number];
112
+ int end = region.end[number];
113
+ String strValue = new String(line_bytes, begin, end - begin, StandardCharsets.UTF_8);
114
+
115
+ logger.debug(String.format(Locale.ENGLISH, "<%s> = %s", name, strValue));
116
+ setValue(schema, visitor, name, strValue);
117
+ }
118
+ pageBuilder.addRecord();
119
+ }
120
+ else if (task.getStopOnInvalidRecord() == false) {
121
+ logger.warn(String.format(Locale.ENGLISH, "skip unmatched line = %s", line));
122
+ }
123
+ else {
124
+ throw new DataException(String.format("Invalid record at line %s", line));
125
+ }
126
+ }
127
+ }
128
+ pageBuilder.finish();
129
+ }
130
+
131
+ private void setValue(Schema schema, ColumnVisitorImpl visitor, String name, String strValue)
132
+ {
133
+ try {
134
+ Value value = ValueFactory.newString(strValue);
135
+ Column column = schema.lookupColumn(name);
136
+ visitor.setValue(value);
137
+ column.visit(visitor);
138
+ }
139
+ catch (Exception ex) {
140
+ throw new DataException(String.format(Locale.ENGLISH, "Set value failed. column = \"%s\" value = \"%s\", reason = \"%s\"", name, strValue, ex.getMessage()));
141
+ }
142
+ }
143
+
144
+ private Regex buildRegex(PluginTask task)
145
+ {
146
+ String format = task.getFormat();
147
+ byte[] pattern = format.getBytes(StandardCharsets.UTF_8);
148
+ // throw org.joni.exception.SyntaxException if regex is invalid.
149
+ return new Regex(pattern, 0, pattern.length, Option.NONE, UTF8Encoding.INSTANCE);
150
+ }
151
+
152
+ private String captureName(NameEntry e)
153
+ {
154
+ return new String(e.name, e.nameP, e.nameEnd - e.nameP);
155
+ }
156
+
157
+ private void validateSchema(PluginTask task, Schema schema)
158
+ {
159
+ Regex regex = buildRegex(task);
160
+ if (regex.numberOfNames() < 1) {
161
+ throw new ConfigException("The regex has no named capturing group");
162
+ }
163
+ for (Iterator<NameEntry> entry = regex.namedBackrefIterator(); entry.hasNext(); ) {
164
+ NameEntry e = entry.next();
165
+ String captureName = captureName(e);
166
+ schema.lookupColumn(captureName); // throw SchemaConfigException;
167
+ }
168
+ }
169
+ }
@@ -0,0 +1,22 @@
1
+ package org.embulk.parser.joni_regexp;
2
+
3
+ import org.embulk.spi.DataException;
4
+
5
+ public class JsonRecordValidateException
6
+ extends DataException
7
+ {
8
+ public JsonRecordValidateException(String message)
9
+ {
10
+ super(message);
11
+ }
12
+
13
+ public JsonRecordValidateException(String message, Throwable cause)
14
+ {
15
+ super(message, cause);
16
+ }
17
+
18
+ public JsonRecordValidateException(Throwable cause)
19
+ {
20
+ super(cause);
21
+ }
22
+ }
@@ -0,0 +1,39 @@
1
+ package org.embulk.parser.joni_regexp.cast;
2
+
3
+ import org.embulk.spi.DataException;
4
+ import org.embulk.spi.time.Timestamp;
5
+
6
+ public class BooleanCast
7
+ {
8
+ private BooleanCast() {}
9
+
10
+ private static String buildErrorMessage(String as, boolean value)
11
+ {
12
+ return String.format("cannot cast boolean to %s: \"%s\"", as, value);
13
+ }
14
+
15
+ public static boolean asBoolean(boolean value) throws DataException
16
+ {
17
+ return value;
18
+ }
19
+
20
+ public static long asLong(boolean value) throws DataException
21
+ {
22
+ return value ? 1 : 0;
23
+ }
24
+
25
+ public static double asDouble(boolean value) throws DataException
26
+ {
27
+ throw new DataException(buildErrorMessage("double", value));
28
+ }
29
+
30
+ public static String asString(boolean value) throws DataException
31
+ {
32
+ return value ? "true" : "false";
33
+ }
34
+
35
+ public static Timestamp asTimestamp(boolean value) throws DataException
36
+ {
37
+ throw new DataException(buildErrorMessage("timestamp", value));
38
+ }
39
+ }
@@ -0,0 +1,41 @@
1
+ package org.embulk.parser.joni_regexp.cast;
2
+
3
+ import org.embulk.spi.DataException;
4
+ import org.embulk.spi.time.Timestamp;
5
+
6
+ public class DoubleCast
7
+ {
8
+ private DoubleCast() {}
9
+
10
+ private static String buildErrorMessage(String as, double value)
11
+ {
12
+ return String.format("cannot cast double to %s: \"%s\"", as, value);
13
+ }
14
+
15
+ public static boolean asBoolean(double value) throws DataException
16
+ {
17
+ throw new DataException(buildErrorMessage("boolean", value));
18
+ }
19
+
20
+ public static long asLong(double value) throws DataException
21
+ {
22
+ return (long) value;
23
+ }
24
+
25
+ public static double asDouble(double value) throws DataException
26
+ {
27
+ return value;
28
+ }
29
+
30
+ public static String asString(double value) throws DataException
31
+ {
32
+ return String.valueOf(value);
33
+ }
34
+
35
+ public static Timestamp asTimestamp(double value) throws DataException
36
+ {
37
+ long epochSecond = (long) value;
38
+ long nanoAdjustMent = (long) ((value - epochSecond) * 1000000000);
39
+ return Timestamp.ofEpochSecond(epochSecond, nanoAdjustMent);
40
+ }
41
+ }
@@ -0,0 +1,40 @@
1
+ package org.embulk.parser.joni_regexp.cast;
2
+
3
+ import org.embulk.spi.DataException;
4
+ import org.embulk.spi.time.Timestamp;
5
+ import org.msgpack.value.Value;
6
+
7
+ public class JsonCast
8
+ {
9
+ private JsonCast() {}
10
+
11
+ private static String buildErrorMessage(String as, Value value)
12
+ {
13
+ return String.format("cannot cast Json to %s: \"%s\"", as, value);
14
+ }
15
+
16
+ public static boolean asBoolean(Value value) throws DataException
17
+ {
18
+ throw new DataException(buildErrorMessage("boolean", value));
19
+ }
20
+
21
+ public static long asLong(Value value) throws DataException
22
+ {
23
+ throw new DataException(buildErrorMessage("long", value));
24
+ }
25
+
26
+ public static double asDouble(Value value) throws DataException
27
+ {
28
+ throw new DataException(buildErrorMessage("double", value));
29
+ }
30
+
31
+ public static String asString(Value value) throws DataException
32
+ {
33
+ return value.toString();
34
+ }
35
+
36
+ public static Timestamp asTimestamp(Value value) throws DataException
37
+ {
38
+ throw new DataException(buildErrorMessage("timestamp", value));
39
+ }
40
+ }