embulk-parser-joni_regexp 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +13 -0
  3. data/.travis.yml +7 -0
  4. data/LICENSE.txt +21 -0
  5. data/README.md +107 -0
  6. data/build.gradle +101 -0
  7. data/config/checkstyle/checkstyle.xml +128 -0
  8. data/config/checkstyle/default.xml +108 -0
  9. data/example/apache.txt +5000 -0
  10. data/example/apache.yml +18 -0
  11. data/example/conf.yml +12 -0
  12. data/example/conf2.yml +12 -0
  13. data/example/conf3.yml +12 -0
  14. data/example/seed.yml +8 -0
  15. data/example/test.txt +8 -0
  16. data/example/test2.txt +3 -0
  17. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  18. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  19. data/gradlew +169 -0
  20. data/gradlew.bat +84 -0
  21. data/lib/embulk/guess/joni_regexp.rb +28 -0
  22. data/lib/embulk/parser/joni_regexp.rb +3 -0
  23. data/src/main/java/org/embulk/parser/joni_regexp/ColumnCaster.java +97 -0
  24. data/src/main/java/org/embulk/parser/joni_regexp/ColumnVisitorImpl.java +167 -0
  25. data/src/main/java/org/embulk/parser/joni_regexp/JoniRegexpParserPlugin.java +169 -0
  26. data/src/main/java/org/embulk/parser/joni_regexp/JsonRecordValidateException.java +22 -0
  27. data/src/main/java/org/embulk/parser/joni_regexp/cast/BooleanCast.java +39 -0
  28. data/src/main/java/org/embulk/parser/joni_regexp/cast/DoubleCast.java +41 -0
  29. data/src/main/java/org/embulk/parser/joni_regexp/cast/JsonCast.java +40 -0
  30. data/src/main/java/org/embulk/parser/joni_regexp/cast/LongCast.java +47 -0
  31. data/src/main/java/org/embulk/parser/joni_regexp/cast/StringCast.java +82 -0
  32. data/src/test/java/org/embulk/parser/joni_regexp/TestColumnCaster.java +256 -0
  33. data/src/test/java/org/embulk/parser/joni_regexp/TestJoniRegexpParserPlugin.java +432 -0
  34. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestBooleanCast.java +56 -0
  35. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestDoubleCast.java +49 -0
  36. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestJsonCast.java +79 -0
  37. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestLongCast.java +41 -0
  38. data/src/test/java/org/embulk/parser/joni_regexp/cast/TestStringCast.java +103 -0
  39. metadata +112 -0
@@ -0,0 +1,167 @@
1
+ package org.embulk.parser.joni_regexp;
2
+
3
+ import com.google.common.base.Optional;
4
+ import org.embulk.parser.joni_regexp.JoniRegexpParserPlugin.PluginTask;
5
+ import org.embulk.parser.joni_regexp.JoniRegexpParserPlugin.TypecastColumnOption;
6
+
7
+ import org.embulk.spi.Column;
8
+ import org.embulk.spi.ColumnConfig;
9
+ import org.embulk.spi.ColumnVisitor;
10
+ import org.embulk.spi.PageBuilder;
11
+ import org.embulk.spi.Schema;
12
+ import org.embulk.spi.SchemaConfig;
13
+ import org.embulk.spi.json.JsonParser;
14
+ import org.embulk.spi.time.Timestamp;
15
+ import org.embulk.spi.time.TimestampParser;
16
+ import org.msgpack.core.MessageTypeException;
17
+ import org.msgpack.value.Value;
18
+
19
+ public class ColumnVisitorImpl implements ColumnVisitor
20
+ {
21
+ protected final PluginTask task;
22
+ protected final Schema schema;
23
+ protected final PageBuilder pageBuilder;
24
+ protected final TimestampParser[] timestampParsers;
25
+ protected final Boolean[] autoTypecasts;
26
+
27
+ protected Value value;
28
+
29
+ public ColumnVisitorImpl(PluginTask task, Schema schema, PageBuilder pageBuilder, TimestampParser[] timestampParsers)
30
+ {
31
+ this.task = task;
32
+ this.schema = schema;
33
+ this.pageBuilder = pageBuilder;
34
+ this.timestampParsers = timestampParsers.clone();
35
+ this.autoTypecasts = new Boolean[schema.size()];
36
+ buildAutoTypecasts();
37
+ }
38
+
39
+ private void buildAutoTypecasts()
40
+ {
41
+ for (Column column : schema.getColumns()) {
42
+ this.autoTypecasts[column.getIndex()] = task.getDefaultTypecast();
43
+ }
44
+
45
+ SchemaConfig schemaConfig = task.getColumns();
46
+
47
+ for (ColumnConfig columnConfig : schemaConfig.getColumns()) {
48
+ TypecastColumnOption columnOption = columnConfig.getOption().loadConfig(TypecastColumnOption.class);
49
+ Boolean autoTypecast = columnOption.getTypecast().or(task.getDefaultTypecast());
50
+ Column column = schema.lookupColumn(columnConfig.getName());
51
+ this.autoTypecasts[column.getIndex()] = autoTypecast;
52
+ }
53
+ }
54
+
55
+ public void setValue(Value value)
56
+ {
57
+ this.value = value;
58
+ }
59
+
60
+ @Override
61
+ public void booleanColumn(Column column)
62
+ {
63
+ if (isNil(value)) {
64
+ pageBuilder.setNull(column);
65
+ }
66
+ else {
67
+ try {
68
+ boolean booleanValue = autoTypecasts[column.getIndex()] ? ColumnCaster.asBoolean(value) : value.asBooleanValue().getBoolean();
69
+ pageBuilder.setBoolean(column, booleanValue);
70
+ }
71
+ catch (MessageTypeException e) {
72
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Boolean", value), e);
73
+ }
74
+ }
75
+ }
76
+
77
+ @Override
78
+ public void longColumn(Column column)
79
+ {
80
+ if (isNil(value)) {
81
+ pageBuilder.setNull(column);
82
+ }
83
+ else {
84
+ try {
85
+ long longValue = autoTypecasts[column.getIndex()] ? ColumnCaster.asLong(value) : value.asIntegerValue().toLong();
86
+ pageBuilder.setLong(column, longValue);
87
+ }
88
+ catch (MessageTypeException e) {
89
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Long", value), e);
90
+ }
91
+ }
92
+ }
93
+
94
+ @Override
95
+ public void doubleColumn(Column column)
96
+ {
97
+ if (isNil(value)) {
98
+ pageBuilder.setNull(column);
99
+ }
100
+ else {
101
+ try {
102
+ double doubleValue = autoTypecasts[column.getIndex()] ? ColumnCaster.asDouble(value) : value.asFloatValue().toDouble();
103
+ pageBuilder.setDouble(column, doubleValue);
104
+ }
105
+ catch (MessageTypeException e) {
106
+ throw new JsonRecordValidateException(String.format("failed get \"%s\" as Double", value), e);
107
+ }
108
+ }
109
+ }
110
+
111
+ @Override
112
+ public void stringColumn(Column column)
113
+ {
114
+ if (isNil(value)) {
115
+ pageBuilder.setNull(column);
116
+ }
117
+ else {
118
+ try {
119
+ String string = autoTypecasts[column.getIndex()] ? ColumnCaster.asString(value) : value.asStringValue().toString();
120
+ pageBuilder.setString(column, string);
121
+ }
122
+ catch (MessageTypeException e) {
123
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as String", value), e);
124
+ }
125
+ }
126
+ }
127
+
128
+ @Override
129
+ public void timestampColumn(Column column)
130
+ {
131
+ if (isNil(value)) {
132
+ pageBuilder.setNull(column);
133
+ }
134
+ else {
135
+ try {
136
+ Timestamp timestamp = ColumnCaster.asTimestamp(value, timestampParsers[column.getIndex()]);
137
+ pageBuilder.setTimestamp(column, timestamp);
138
+ }
139
+ catch (MessageTypeException e) {
140
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Timestamp", value), e);
141
+ }
142
+ }
143
+ }
144
+
145
+ @Override
146
+ public void jsonColumn(Column column)
147
+ {
148
+ if (isNil(value)) {
149
+ pageBuilder.setNull(column);
150
+ }
151
+ else {
152
+ try {
153
+ JsonParser parser = new JsonParser();
154
+ Value json = parser.parse(value.toString());
155
+ pageBuilder.setJson(column, json);
156
+ }
157
+ catch (MessageTypeException e) {
158
+ throw new JsonRecordValidateException(String.format("failed to get \"%s\" as Json", value), e);
159
+ }
160
+ }
161
+ }
162
+
163
+ protected boolean isNil(Value v)
164
+ {
165
+ return v == null || v.isNilValue();
166
+ }
167
+ }
@@ -0,0 +1,169 @@
1
+ package org.embulk.parser.joni_regexp;
2
+
3
+ import com.google.common.base.Optional;
4
+ import org.embulk.config.Config;
5
+ import org.embulk.config.ConfigDefault;
6
+ import org.embulk.config.ConfigException;
7
+ import org.embulk.config.ConfigSource;
8
+ import org.embulk.config.Task;
9
+ import org.embulk.config.TaskSource;
10
+ import org.embulk.spi.Column;
11
+ import org.embulk.spi.DataException;
12
+ import org.embulk.spi.Exec;
13
+ import org.embulk.spi.FileInput;
14
+ import org.embulk.spi.PageBuilder;
15
+ import org.embulk.spi.PageOutput;
16
+ import org.embulk.spi.ParserPlugin;
17
+ import org.embulk.spi.Schema;
18
+ import org.embulk.spi.SchemaConfig;
19
+ import org.embulk.spi.time.TimestampParser;
20
+ import org.embulk.spi.util.LineDecoder;
21
+ import org.embulk.spi.util.Timestamps;
22
+ import org.jcodings.specific.UTF8Encoding;
23
+ import org.joni.Matcher;
24
+ import org.joni.NameEntry;
25
+ import org.joni.Option;
26
+ import org.joni.Regex;
27
+ import org.joni.Region;
28
+ import org.msgpack.value.Value;
29
+ import org.msgpack.value.ValueFactory;
30
+ import org.slf4j.Logger;
31
+
32
+ import java.nio.charset.Charset;
33
+ import java.nio.charset.StandardCharsets;
34
+ import java.util.Iterator;
35
+ import java.util.Locale;
36
+
37
+ public class JoniRegexpParserPlugin
38
+ implements ParserPlugin
39
+ {
40
+ private static final Logger logger = Exec.getLogger(JoniRegexpParserPlugin.class);
41
+
42
+ public interface TypecastColumnOption
43
+ extends Task
44
+ {
45
+ @Config("typecast")
46
+ @ConfigDefault("null")
47
+ public Optional<Boolean> getTypecast();
48
+ }
49
+
50
+ public interface PluginTask
51
+ extends Task, LineDecoder.DecoderTask, TimestampParser.Task
52
+ {
53
+ @Config("columns")
54
+ SchemaConfig getColumns();
55
+
56
+ @Config("stop_on_invalid_record")
57
+ @ConfigDefault("false")
58
+ boolean getStopOnInvalidRecord();
59
+
60
+ @Config("format")
61
+ String getFormat();
62
+
63
+ @Config("default_typecast")
64
+ @ConfigDefault("true")
65
+ Boolean getDefaultTypecast();
66
+ }
67
+
68
+ @Override
69
+ public void transaction(ConfigSource config, ParserPlugin.Control control)
70
+ {
71
+ PluginTask task = config.loadConfig(PluginTask.class);
72
+
73
+ Schema schema = task.getColumns().toSchema();
74
+
75
+ validateSchema(task, schema);
76
+
77
+ control.run(task.dump(), schema);
78
+ }
79
+
80
+ @Override
81
+ public void run(TaskSource taskSource, Schema schema,
82
+ FileInput input, PageOutput output)
83
+ {
84
+ PluginTask task = taskSource.loadTask(PluginTask.class);
85
+ LineDecoder lineDecoder = new LineDecoder(input, task);
86
+ PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output);
87
+ TimestampParser[] timestampParsers = Timestamps.newTimestampColumnParsers(task, task.getColumns());
88
+
89
+ ColumnVisitorImpl visitor = new ColumnVisitorImpl(task, schema, pageBuilder, timestampParsers);
90
+
91
+ Regex regex = buildRegex(task);
92
+
93
+ while (input.nextFile()) {
94
+ while (true) {
95
+ String line = lineDecoder.poll();
96
+ // logger.debug(String.format(Locale.ENGLISH,"line = %s",line));
97
+ if (line == null) {
98
+ break;
99
+ }
100
+ byte[] line_bytes = line.getBytes(StandardCharsets.UTF_8);
101
+ Matcher matcher = regex.matcher(line_bytes);
102
+ int result = matcher.search(0, line_bytes.length, Option.DEFAULT);
103
+
104
+ if (result != -1) {
105
+ Region region = matcher.getEagerRegion();
106
+ for (Iterator<NameEntry> entry = regex.namedBackrefIterator(); entry.hasNext(); ) {
107
+ NameEntry e = entry.next();
108
+ String name = captureName(e);
109
+
110
+ int number = e.getBackRefs()[0];
111
+ int begin = region.beg[number];
112
+ int end = region.end[number];
113
+ String strValue = new String(line_bytes, begin, end - begin, StandardCharsets.UTF_8);
114
+
115
+ logger.debug(String.format(Locale.ENGLISH, "<%s> = %s", name, strValue));
116
+ setValue(schema, visitor, name, strValue);
117
+ }
118
+ pageBuilder.addRecord();
119
+ }
120
+ else if (task.getStopOnInvalidRecord() == false) {
121
+ logger.warn(String.format(Locale.ENGLISH, "skip unmatched line = %s", line));
122
+ }
123
+ else {
124
+ throw new DataException(String.format("Invalid record at line %s", line));
125
+ }
126
+ }
127
+ }
128
+ pageBuilder.finish();
129
+ }
130
+
131
+ private void setValue(Schema schema, ColumnVisitorImpl visitor, String name, String strValue)
132
+ {
133
+ try {
134
+ Value value = ValueFactory.newString(strValue);
135
+ Column column = schema.lookupColumn(name);
136
+ visitor.setValue(value);
137
+ column.visit(visitor);
138
+ }
139
+ catch (Exception ex) {
140
+ throw new DataException(String.format(Locale.ENGLISH, "Set value failed. column = \"%s\" value = \"%s\", reason = \"%s\"", name, strValue, ex.getMessage()));
141
+ }
142
+ }
143
+
144
+ private Regex buildRegex(PluginTask task)
145
+ {
146
+ String format = task.getFormat();
147
+ byte[] pattern = format.getBytes(StandardCharsets.UTF_8);
148
+ // throw org.joni.exception.SyntaxException if regex is invalid.
149
+ return new Regex(pattern, 0, pattern.length, Option.NONE, UTF8Encoding.INSTANCE);
150
+ }
151
+
152
+ private String captureName(NameEntry e)
153
+ {
154
+ return new String(e.name, e.nameP, e.nameEnd - e.nameP);
155
+ }
156
+
157
+ private void validateSchema(PluginTask task, Schema schema)
158
+ {
159
+ Regex regex = buildRegex(task);
160
+ if (regex.numberOfNames() < 1) {
161
+ throw new ConfigException("The regex has no named capturing group");
162
+ }
163
+ for (Iterator<NameEntry> entry = regex.namedBackrefIterator(); entry.hasNext(); ) {
164
+ NameEntry e = entry.next();
165
+ String captureName = captureName(e);
166
+ schema.lookupColumn(captureName); // throw SchemaConfigException;
167
+ }
168
+ }
169
+ }
@@ -0,0 +1,22 @@
1
+ package org.embulk.parser.joni_regexp;
2
+
3
+ import org.embulk.spi.DataException;
4
+
5
+ public class JsonRecordValidateException
6
+ extends DataException
7
+ {
8
+ public JsonRecordValidateException(String message)
9
+ {
10
+ super(message);
11
+ }
12
+
13
+ public JsonRecordValidateException(String message, Throwable cause)
14
+ {
15
+ super(message, cause);
16
+ }
17
+
18
+ public JsonRecordValidateException(Throwable cause)
19
+ {
20
+ super(cause);
21
+ }
22
+ }
@@ -0,0 +1,39 @@
1
+ package org.embulk.parser.joni_regexp.cast;
2
+
3
+ import org.embulk.spi.DataException;
4
+ import org.embulk.spi.time.Timestamp;
5
+
6
+ public class BooleanCast
7
+ {
8
+ private BooleanCast() {}
9
+
10
+ private static String buildErrorMessage(String as, boolean value)
11
+ {
12
+ return String.format("cannot cast boolean to %s: \"%s\"", as, value);
13
+ }
14
+
15
+ public static boolean asBoolean(boolean value) throws DataException
16
+ {
17
+ return value;
18
+ }
19
+
20
+ public static long asLong(boolean value) throws DataException
21
+ {
22
+ return value ? 1 : 0;
23
+ }
24
+
25
+ public static double asDouble(boolean value) throws DataException
26
+ {
27
+ throw new DataException(buildErrorMessage("double", value));
28
+ }
29
+
30
+ public static String asString(boolean value) throws DataException
31
+ {
32
+ return value ? "true" : "false";
33
+ }
34
+
35
+ public static Timestamp asTimestamp(boolean value) throws DataException
36
+ {
37
+ throw new DataException(buildErrorMessage("timestamp", value));
38
+ }
39
+ }
@@ -0,0 +1,41 @@
1
+ package org.embulk.parser.joni_regexp.cast;
2
+
3
+ import org.embulk.spi.DataException;
4
+ import org.embulk.spi.time.Timestamp;
5
+
6
+ public class DoubleCast
7
+ {
8
+ private DoubleCast() {}
9
+
10
+ private static String buildErrorMessage(String as, double value)
11
+ {
12
+ return String.format("cannot cast double to %s: \"%s\"", as, value);
13
+ }
14
+
15
+ public static boolean asBoolean(double value) throws DataException
16
+ {
17
+ throw new DataException(buildErrorMessage("boolean", value));
18
+ }
19
+
20
+ public static long asLong(double value) throws DataException
21
+ {
22
+ return (long) value;
23
+ }
24
+
25
+ public static double asDouble(double value) throws DataException
26
+ {
27
+ return value;
28
+ }
29
+
30
+ public static String asString(double value) throws DataException
31
+ {
32
+ return String.valueOf(value);
33
+ }
34
+
35
+ public static Timestamp asTimestamp(double value) throws DataException
36
+ {
37
+ long epochSecond = (long) value;
38
+ long nanoAdjustMent = (long) ((value - epochSecond) * 1000000000);
39
+ return Timestamp.ofEpochSecond(epochSecond, nanoAdjustMent);
40
+ }
41
+ }
@@ -0,0 +1,40 @@
1
+ package org.embulk.parser.joni_regexp.cast;
2
+
3
+ import org.embulk.spi.DataException;
4
+ import org.embulk.spi.time.Timestamp;
5
+ import org.msgpack.value.Value;
6
+
7
+ public class JsonCast
8
+ {
9
+ private JsonCast() {}
10
+
11
+ private static String buildErrorMessage(String as, Value value)
12
+ {
13
+ return String.format("cannot cast Json to %s: \"%s\"", as, value);
14
+ }
15
+
16
+ public static boolean asBoolean(Value value) throws DataException
17
+ {
18
+ throw new DataException(buildErrorMessage("boolean", value));
19
+ }
20
+
21
+ public static long asLong(Value value) throws DataException
22
+ {
23
+ throw new DataException(buildErrorMessage("long", value));
24
+ }
25
+
26
+ public static double asDouble(Value value) throws DataException
27
+ {
28
+ throw new DataException(buildErrorMessage("double", value));
29
+ }
30
+
31
+ public static String asString(Value value) throws DataException
32
+ {
33
+ return value.toString();
34
+ }
35
+
36
+ public static Timestamp asTimestamp(Value value) throws DataException
37
+ {
38
+ throw new DataException(buildErrorMessage("timestamp", value));
39
+ }
40
+ }