embulk-parser-grok 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +113 -0
  5. data/build.gradle +95 -0
  6. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  7. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  8. data/gradlew +164 -0
  9. data/gradlew.bat +90 -0
  10. data/lib/embulk/guess/grok.rb +3 -0
  11. data/lib/embulk/parser/grok.rb +3 -0
  12. data/pattern/grok-patterns +105 -0
  13. data/pattern/my-patterns +12 -0
  14. data/sample/apache.yml +28 -0
  15. data/sample/apache_stop.yml +29 -0
  16. data/sample/guess.yml +19 -0
  17. data/sample/multiline.yml +19 -0
  18. data/src/main/java/org/embulk/parser/grok/DateParser.java +8 -0
  19. data/src/main/java/org/embulk/parser/grok/GrokColumnVisitor.java +98 -0
  20. data/src/main/java/org/embulk/parser/grok/GrokGuessPlugin.java +68 -0
  21. data/src/main/java/org/embulk/parser/grok/GrokGuesser.java +185 -0
  22. data/src/main/java/org/embulk/parser/grok/GrokParserPlugin.java +94 -0
  23. data/src/main/java/org/embulk/parser/grok/GrokRecordIterator.java +71 -0
  24. data/src/main/java/org/embulk/parser/grok/GrokRecordValidateException.java +11 -0
  25. data/src/main/java/org/embulk/parser/grok/MultipleLineDecoder.java +66 -0
  26. data/src/main/java/org/embulk/parser/grok/TimestampParserFactory.java +85 -0
  27. data/src/test/java/org/embulk/parser/TestGrokGuessPlugin.java +50 -0
  28. data/src/test/java/org/embulk/parser/TestGrokParserPlugin.java +55 -0
  29. data/src/test/java/org/embulk/parser/TestGrokPluginBase.java +69 -0
  30. data/src/test/java/org/embulk/util/EmbulkPluginTester.java +80 -0
  31. data/src/test/java/org/embulk/util/StreamUtil.java +30 -0
  32. data/src/test/resources/apache.log +101 -0
  33. data/src/test/resources/apache.yml +35 -0
  34. data/src/test/resources/apache_with_error.log +101 -0
  35. data/src/test/resources/expected_apache.csv +102 -0
  36. data/src/test/resources/expected_multiline.csv +45 -0
  37. data/src/test/resources/guess.yml +19 -0
  38. data/src/test/resources/multiline.log +44 -0
  39. data/src/test/resources/multiline.yml +25 -0
  40. metadata +115 -0
@@ -0,0 +1,94 @@
1
+ package org.embulk.parser.grok;
2
+
3
+
4
+ import com.google.common.base.Optional;
5
+ import org.embulk.config.*;
6
+ import org.embulk.spi.*;
7
+ import org.embulk.spi.time.TimestampParser;
8
+ import org.embulk.spi.util.LineDecoder;
9
+ import org.slf4j.Logger;
10
+
11
+ import java.util.List;
12
+ import java.util.Map;
13
+
14
+ public class GrokParserPlugin implements ParserPlugin {
15
+
16
+ public final Logger logger = Exec.getLogger(GrokParserPlugin.class.getName());
17
+
18
+ public interface PluginTask
19
+ extends Task, LineDecoder.DecoderTask, TimestampParser.Task {
20
+
21
+ @Config("grok_pattern")
22
+ String getGrokPattern();
23
+
24
+ @Config("first_line_pattern")
25
+ @ConfigDefault("null")
26
+ Optional<String> getFirstLinePattern();
27
+
28
+ @Config("grok_pattern_files")
29
+ List<String> getGrokPatternFiles();
30
+
31
+ @Config("timestamp_parser")
32
+ @ConfigDefault("\"ruby\"")
33
+ String getTimestampParser();
34
+
35
+ @Config("columns")
36
+ SchemaConfig getColumns();
37
+
38
+ @Config("stop_on_invalid_record")
39
+ @ConfigDefault("false")
40
+ boolean getStopOnInvalidRecord();
41
+ }
42
+
43
+ @Override
44
+ public void transaction(ConfigSource config, ParserPlugin.Control control) {
45
+ PluginTask task = config.loadConfig(PluginTask.class);
46
+ Schema schema = task.getColumns().toSchema();
47
+
48
+ control.run(task.dump(), schema);
49
+ }
50
+
51
+ @Override
52
+ public void run(TaskSource taskSource, Schema schema, FileInput input, PageOutput output) {
53
+ GrokParserPlugin.PluginTask task = taskSource.loadTask(GrokParserPlugin.PluginTask.class);
54
+
55
+ LineDecoder decoder;
56
+ if (task.getFirstLinePattern().isPresent()) {
57
+ decoder = new MultipleLineDecoder(input, task);
58
+ } else {
59
+ decoder = new LineDecoder(input, task);
60
+ }
61
+
62
+ try (GrokRecordIterator iterator = new GrokRecordIterator(decoder, task)) {
63
+ final List<DateParser> timestampParsers = TimestampParserFactory.create(task);
64
+ PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output);
65
+ while (true) {
66
+ if (!iterator.nextFile()) {
67
+ break;
68
+ }
69
+ while (true) {
70
+ if (!iterator.nextLine()) {
71
+ break;
72
+ }
73
+
74
+ try {
75
+ Map<String, Object> record = iterator.getCurrentRecord();
76
+ if (record.keySet().size() != 0) {
77
+ schema.visitColumns(new GrokColumnVisitor(record, pageBuilder, timestampParsers));
78
+ pageBuilder.addRecord();
79
+ }
80
+ } catch (GrokRecordValidateException e) {
81
+ String skippedLine = iterator.getCurrentLine();
82
+ long lineNumber = iterator.getCurrentLineNumber();
83
+ if (task.getStopOnInvalidRecord()) {
84
+ throw new DataException(String.format("Invalid record at line %d: %s", lineNumber, skippedLine), e);
85
+ }
86
+ logger.warn(String.format("Skipped line %d (%s): %s", lineNumber, e.getMessage(), skippedLine));
87
+ }
88
+ }
89
+ }
90
+ pageBuilder.finish();
91
+ }
92
+ }
93
+
94
+ }
@@ -0,0 +1,71 @@
1
+ package org.embulk.parser.grok;
2
+
3
+ import java.io.Closeable;
4
+ import java.util.Map;
5
+
6
+ import oi.thekraken.grok.api.Grok;
7
+ import oi.thekraken.grok.api.Match;
8
+ import oi.thekraken.grok.api.exception.GrokException;
9
+
10
+ import org.embulk.spi.Exec;
11
+ import org.embulk.spi.util.LineDecoder;
12
+ import org.slf4j.Logger;
13
+
14
+ public class GrokRecordIterator implements Closeable {
15
+
16
+ private LineDecoder decoder;
17
+ private Grok grok = null;
18
+
19
+ private String currentLine = null;
20
+ private long currentLineNumber = 0;
21
+
22
+ private final Logger logger = Exec.getLogger(GrokRecordIterator.class.getName());
23
+
24
+ public GrokRecordIterator(LineDecoder decoder, GrokParserPlugin.PluginTask task) {
25
+ this.decoder = decoder;
26
+ try {
27
+ this.grok = new Grok();
28
+ for (String file : task.getGrokPatternFiles()) {
29
+ this.grok.addPatternFromFile(file);
30
+ }
31
+ this.grok.compile(task.getGrokPattern());
32
+ } catch (GrokException e) {
33
+ throw new RuntimeException(e);
34
+ }
35
+ }
36
+
37
+ public boolean nextFile() {
38
+ currentLine = null;
39
+ currentLineNumber = 0;
40
+ return decoder.nextFile();
41
+ }
42
+
43
+ public boolean nextLine() {
44
+ currentLine = decoder.poll();
45
+ currentLineNumber++;
46
+ return currentLine != null;
47
+ }
48
+
49
+ @Override
50
+ public void close() {
51
+ decoder.close();
52
+ }
53
+
54
+ public Map<String, Object> getCurrentRecord() {
55
+ Match gm = grok.match(currentLine);
56
+ gm.captures();
57
+ if (gm.isNull()) {
58
+ throw new GrokRecordValidateException("Couldn't parse line");
59
+ }
60
+ return gm.toMap();
61
+ }
62
+
63
+ public String getCurrentLine() {
64
+ return currentLine;
65
+ }
66
+
67
+ public long getCurrentLineNumber() {
68
+ return currentLineNumber;
69
+ }
70
+ }
71
+
@@ -0,0 +1,11 @@
1
+ package org.embulk.parser.grok;
2
+
3
/**
 * Unchecked exception signalling that a single input record could not be turned
 * into a valid row. Both constructors are package-private.
 */
public class GrokRecordValidateException extends RuntimeException {

    /**
     * @param message description of why the record is invalid
     */
    GrokRecordValidateException(final String message) {
        super(message);
    }

    /**
     * @param cause the underlying failure
     */
    GrokRecordValidateException(final Throwable cause) {
        super(cause);
    }
}
@@ -0,0 +1,66 @@
1
+ package org.embulk.parser.grok;
2
+
3
+ import com.google.common.base.Strings;
4
+ import oi.thekraken.grok.api.Grok;
5
+ import oi.thekraken.grok.api.Match;
6
+ import oi.thekraken.grok.api.exception.GrokException;
7
+ import org.embulk.spi.FileInput;
8
+ import org.embulk.spi.util.LineDecoder;
9
+
10
+ public class MultipleLineDecoder extends LineDecoder {
11
+
12
+ private StringBuilder buildLines;
13
+ private Grok grok = null;
14
+
15
+ public MultipleLineDecoder(FileInput in, GrokParserPlugin.PluginTask task) {
16
+ super(in, task);
17
+
18
+ try {
19
+ this.grok = new Grok();
20
+ for (String file : task.getGrokPatternFiles()) {
21
+ this.grok.addPatternFromFile(file);
22
+ }
23
+ this.grok.compile(task.getFirstLinePattern().get());
24
+ } catch (GrokException e) {
25
+ throw new RuntimeException(e);
26
+ }
27
+ this.buildLines = new StringBuilder();
28
+ }
29
+
30
+ @Override
31
+ public boolean nextFile() {
32
+ return super.nextFile();
33
+ }
34
+
35
+ @Override
36
+ public void close() {
37
+ super.close();
38
+ }
39
+
40
+ @Override
41
+ public String poll() {
42
+ String currentLine;
43
+ while ((currentLine = super.poll()) != null) {
44
+ Match gm = grok.match(currentLine);
45
+ gm.captures();
46
+
47
+ if (!gm.isNull()) {
48
+ String fullLog = this.buildLines.toString();
49
+ if (!Strings.isNullOrEmpty(fullLog)) {
50
+ this.buildLines.setLength(0);
51
+ this.buildLines.append(currentLine).append(System.lineSeparator());
52
+ return fullLog;
53
+ }
54
+ }
55
+ this.buildLines.append(currentLine).append(System.lineSeparator());
56
+ }
57
+
58
+ String fullLog = this.buildLines.toString();
59
+ if (!Strings.isNullOrEmpty(fullLog)) {
60
+ this.buildLines.setLength(0);
61
+ return fullLog;
62
+ }
63
+
64
+ return null;
65
+ }
66
+ }
@@ -0,0 +1,85 @@
1
+ package org.embulk.parser.grok;
2
+
3
+ import com.google.common.collect.ImmutableMap;
4
+ import org.embulk.config.Task;
5
+ import org.embulk.spi.ColumnConfig;
6
+ import org.embulk.spi.time.Timestamp;
7
+ import org.embulk.spi.time.TimestampParser;
8
+ import org.embulk.spi.type.TimestampType;
9
+ import org.embulk.spi.util.Timestamps;
10
+ import org.joda.time.DateTimeZone;
11
+
12
+ import java.text.ParseException;
13
+ import java.text.SimpleDateFormat;
14
+ import java.util.Arrays;
15
+ import java.util.List;
16
+ import java.util.Locale;
17
+ import java.util.Map;
18
+ import java.util.stream.Collectors;
19
+
20
+ public class TimestampParserFactory {
21
+ private static Map<String, String> formatMap = ImmutableMap.<String, String>builder()
22
+ .put("%Y", "yyyy")
23
+ .put("%m", "MM")
24
+ .put("%d", "dd")
25
+ .put("%H", "HH")
26
+ .put("%M", "mm")
27
+ .put("%S", "ss")
28
+ .put("%z", "z")
29
+ .put("%T", "HH:mm:ss")
30
+ .put("%b", "MMM")
31
+ .put("%N", "SSS")
32
+ .put("%6N", "SSS")
33
+ .put("T", "'T'")
34
+ .build();
35
+
36
+ private interface TimestampColumnOption extends Task, TimestampParser.TimestampColumnOption {
37
+ }
38
+
39
+ private static String convertToJavaDateFormat(String rubyFormat) {
40
+ String current = rubyFormat;
41
+
42
+ for (Map.Entry<String, String> entry : formatMap.entrySet()) {
43
+ current = current.replace(entry.getKey(), entry.getValue());
44
+ }
45
+
46
+ return current;
47
+ }
48
+
49
+ public static List<DateParser> create(GrokParserPlugin.PluginTask task) {
50
+ switch (task.getTimestampParser().toLowerCase()) {
51
+ case "ruby":
52
+ TimestampParser[] ps = Timestamps.newTimestampColumnParsers(task, task.getColumns());
53
+ return Arrays.stream(ps)
54
+ .map(parser -> (DateParser) (text) -> parser.parse(text))
55
+ .collect(Collectors.toList());
56
+ case "epoch":
57
+ return task.getColumns().getColumns().stream()
58
+ .map(x -> (DateParser) (text) -> Timestamp.ofEpochMilli(Long.parseLong(text)))
59
+ .collect(Collectors.toList());
60
+ case "sdf":
61
+ case "simpledateformat":
62
+ default:
63
+ SimpleDateFormat[] parsers = new SimpleDateFormat[task.getColumns().getColumnCount()];
64
+ int i = 0;
65
+ for (ColumnConfig column : task.getColumns().getColumns()) {
66
+ if (column.getType() instanceof TimestampType) {
67
+ TimestampColumnOption option = column.getOption().loadConfig(TimestampColumnOption.class);
68
+ String format = convertToJavaDateFormat(option.getFormat().or("yyyy-MM-dd HH:MM:ss.SSS z"));
69
+ SimpleDateFormat sdf = new SimpleDateFormat(format, Locale.ENGLISH);
70
+ sdf.setTimeZone(option.getTimeZone().or(DateTimeZone.UTC).toTimeZone());
71
+ parsers[i] = sdf;
72
+ }
73
+ i++;
74
+ }
75
+ return Arrays.stream(parsers).map(parser ->
76
+ (DateParser) (String date) -> {
77
+ try {
78
+ return Timestamp.ofEpochMilli(parser.parse(date).getTime());
79
+ } catch (ParseException e) {
80
+ throw new GrokRecordValidateException(e);
81
+ }
82
+ }).collect(Collectors.toList());
83
+ }
84
+ }
85
+ }
@@ -0,0 +1,50 @@
1
+ package org.embulk.parser;
2
+
3
+ import com.google.common.collect.ImmutableMap;
4
+ import org.junit.Test;
5
+
6
+ import java.util.Map;
7
+
8
+ import static org.hamcrest.core.Is.is;
9
+ import static org.junit.Assert.assertThat;
10
+
11
+ public class TestGrokGuessPlugin extends TestGrokPluginBase {
12
+
13
+ public void testGuess(String template, String input, String expected) throws Exception {
14
+
15
+ String inputPath = TestGrokParserPlugin.class.getClassLoader().getResource(input).getPath();
16
+
17
+ Map<String, String> params = ImmutableMap.of(
18
+ "__INPUT_PATH__", inputPath,
19
+ "__PROJECT_DIR__", System.getProperty("user.dir")
20
+ );
21
+ String yamlPath = generateConfigYaml(template, params);
22
+ String diff = tester.guess(yamlPath);
23
+
24
+ assertThat(diff, is(expected));
25
+ }
26
+
27
+ @Test
28
+ public void testApacheGuess() throws Exception {
29
+ testGuess("guess.yml", "apache.log", "in:\n" +
30
+ " parser:\n" +
31
+ " grok_pattern: '%{COMBINEDAPACHELOG}'\n" +
32
+ " columns:\n" +
33
+ " - {name: request, type: string}\n" +
34
+ " - {name: agent, type: string}\n" +
35
+ " - {name: COMMONAPACHELOG, type: string}\n" +
36
+ " - {name: auth, type: string}\n" +
37
+ " - {name: ident, type: string}\n" +
38
+ " - {name: verb, type: string}\n" +
39
+ " - {name: referrer, type: string}\n" +
40
+ " - {name: bytes, type: long}\n" +
41
+ " - {name: response, type: long}\n" +
42
+ " - {name: clientip, type: string}\n" +
43
+ " - {name: COMBINEDAPACHELOG, type: string}\n" +
44
+ " - {name: httpversion, type: string}\n" +
45
+ " - {name: rawrequest, type: string}\n" +
46
+ " - {name: timestamp, format: '%d/%b/%Y:%T %z', type: timestamp}\n");
47
+
48
+ }
49
+
50
+ }
@@ -0,0 +1,55 @@
1
+ package org.embulk.parser;
2
+
3
+ import com.google.common.collect.ImmutableMap;
4
+ import org.embulk.util.StreamUtil;
5
+ import org.junit.Test;
6
+
7
+ import java.nio.file.Files;
8
+ import java.nio.file.Path;
9
+ import java.nio.file.Paths;
10
+ import java.util.ArrayList;
11
+ import java.util.List;
12
+ import java.util.Map;
13
+
14
+ import static junit.framework.Assert.assertEquals;
15
+
16
+ public class TestGrokParserPlugin extends TestGrokPluginBase{
17
+
18
+
19
+ public void testRun(String template, String input, String expected) throws Exception {
20
+
21
+ String inputPath = TestGrokParserPlugin.class.getClassLoader().getResource(input).getPath();
22
+ Path outputPath = Paths.get(outputDirectoryPath.toString(), input);
23
+ String expectedPath = TestGrokParserPlugin.class.getClassLoader().getResource(expected).getPath();
24
+
25
+ Map<String, String> params = ImmutableMap.of(
26
+ "__INPUT_PATH__", inputPath,
27
+ "__OUTPUT_PATH__", outputPath.toString(),
28
+ "__PROJECT_DIR__", System.getProperty("user.dir")
29
+ );
30
+ String yamlPath = generateConfigYaml(template, params);
31
+ tester.run(yamlPath);
32
+
33
+ Path outputFilePath = Paths.get(outputDirectoryPath.toString(), input + "000.00.output.csv");
34
+ List<Integer> counter = new ArrayList<>();
35
+ counter.add(0);
36
+ StreamUtil.zip(Files.lines(outputFilePath), Files.lines(Paths.get(expectedPath)))
37
+ .forEach(tuple -> {
38
+ Integer line = counter.get(0);
39
+ line++;
40
+ counter.set(0, line);
41
+ assertEquals(tuple.b, tuple.a);
42
+ });
43
+ }
44
+
45
+ @Test
46
+ public void testApacheLog() throws Exception {
47
+ testRun("apache.yml", "apache.log", "expected_apache.csv");
48
+ }
49
+
50
+ @Test
51
+ public void testMultiLineLog() throws Exception {
52
+ testRun("multiline.yml", "multiline.log", "expected_multiline.csv");
53
+ }
54
+
55
+ }
@@ -0,0 +1,69 @@
1
+ package org.embulk.parser;
2
+
3
+ import org.embulk.parser.grok.GrokGuessPlugin;
4
+ import org.embulk.parser.grok.GrokParserPlugin;
5
+ import org.embulk.spi.GuessPlugin;
6
+ import org.embulk.spi.ParserPlugin;
7
+ import org.embulk.util.EmbulkPluginTester;
8
+ import org.junit.After;
9
+ import org.junit.Before;
10
+
11
+ import java.io.*;
12
+ import java.nio.file.FileVisitResult;
13
+ import java.nio.file.Files;
14
+ import java.nio.file.Path;
15
+ import java.nio.file.SimpleFileVisitor;
16
+ import java.nio.file.attribute.BasicFileAttributes;
17
+ import java.util.Map;
18
+
19
+ public class TestGrokPluginBase {
20
+
21
+ protected String generateConfigYaml(String template, Map<String, String> params) throws IOException {
22
+ File generatedFile = File.createTempFile("generated", "yaml");
23
+
24
+ String templatePath = TestGrokParserPlugin.class.getClassLoader().getResource(template).getPath();
25
+
26
+ try (BufferedReader br = new BufferedReader(new FileReader(new File(templatePath)));
27
+ BufferedWriter writer = new BufferedWriter(new FileWriter(generatedFile))) {
28
+ String line;
29
+ while ((line = br.readLine()) != null) {
30
+
31
+ for (Map.Entry<String, String> entry : params.entrySet()) {
32
+ line = line.replaceAll(entry.getKey(), entry.getValue());
33
+ }
34
+ writer.write(line + "\n");
35
+ }
36
+ }
37
+
38
+ return generatedFile.getAbsolutePath();
39
+ }
40
+
41
+ protected Path outputDirectoryPath;
42
+ protected EmbulkPluginTester tester;
43
+
44
+ @Before
45
+ public void before() throws IOException {
46
+ outputDirectoryPath = Files.createTempDirectory(null);
47
+ tester = new EmbulkPluginTester();
48
+ tester.addPlugin(ParserPlugin.class, "grok", GrokParserPlugin.class);
49
+ tester.addPlugin(GuessPlugin.class, "grok", GrokGuessPlugin.class);
50
+ }
51
+
52
+ @After
53
+ public void after() throws IOException {
54
+ Files.walkFileTree(outputDirectoryPath, new SimpleFileVisitor<Path>() {
55
+ @Override
56
+ public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
57
+ Files.deleteIfExists(dir);
58
+ return super.postVisitDirectory(dir, exc);
59
+ }
60
+
61
+ @Override
62
+ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
63
+ Files.deleteIfExists(file);
64
+ return super.visitFile(file, attrs);
65
+ }
66
+ });
67
+ }
68
+
69
+ }