embulk-parser-grok 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +113 -0
  5. data/build.gradle +95 -0
  6. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  7. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  8. data/gradlew +164 -0
  9. data/gradlew.bat +90 -0
  10. data/lib/embulk/guess/grok.rb +3 -0
  11. data/lib/embulk/parser/grok.rb +3 -0
  12. data/pattern/grok-patterns +105 -0
  13. data/pattern/my-patterns +12 -0
  14. data/sample/apache.yml +28 -0
  15. data/sample/apache_stop.yml +29 -0
  16. data/sample/guess.yml +19 -0
  17. data/sample/multiline.yml +19 -0
  18. data/src/main/java/org/embulk/parser/grok/DateParser.java +8 -0
  19. data/src/main/java/org/embulk/parser/grok/GrokColumnVisitor.java +98 -0
  20. data/src/main/java/org/embulk/parser/grok/GrokGuessPlugin.java +68 -0
  21. data/src/main/java/org/embulk/parser/grok/GrokGuesser.java +185 -0
  22. data/src/main/java/org/embulk/parser/grok/GrokParserPlugin.java +94 -0
  23. data/src/main/java/org/embulk/parser/grok/GrokRecordIterator.java +71 -0
  24. data/src/main/java/org/embulk/parser/grok/GrokRecordValidateException.java +11 -0
  25. data/src/main/java/org/embulk/parser/grok/MultipleLineDecoder.java +66 -0
  26. data/src/main/java/org/embulk/parser/grok/TimestampParserFactory.java +85 -0
  27. data/src/test/java/org/embulk/parser/TestGrokGuessPlugin.java +50 -0
  28. data/src/test/java/org/embulk/parser/TestGrokParserPlugin.java +55 -0
  29. data/src/test/java/org/embulk/parser/TestGrokPluginBase.java +69 -0
  30. data/src/test/java/org/embulk/util/EmbulkPluginTester.java +80 -0
  31. data/src/test/java/org/embulk/util/StreamUtil.java +30 -0
  32. data/src/test/resources/apache.log +101 -0
  33. data/src/test/resources/apache.yml +35 -0
  34. data/src/test/resources/apache_with_error.log +101 -0
  35. data/src/test/resources/expected_apache.csv +102 -0
  36. data/src/test/resources/expected_multiline.csv +45 -0
  37. data/src/test/resources/guess.yml +19 -0
  38. data/src/test/resources/multiline.log +44 -0
  39. data/src/test/resources/multiline.yml +25 -0
  40. metadata +115 -0
@@ -0,0 +1,94 @@
1
+ package org.embulk.parser.grok;
2
+
3
+
4
+ import com.google.common.base.Optional;
5
+ import org.embulk.config.*;
6
+ import org.embulk.spi.*;
7
+ import org.embulk.spi.time.TimestampParser;
8
+ import org.embulk.spi.util.LineDecoder;
9
+ import org.slf4j.Logger;
10
+
11
+ import java.util.List;
12
+ import java.util.Map;
13
+
14
+ public class GrokParserPlugin implements ParserPlugin {
15
+
16
+ public final Logger logger = Exec.getLogger(GrokParserPlugin.class.getName());
17
+
18
+ public interface PluginTask
19
+ extends Task, LineDecoder.DecoderTask, TimestampParser.Task {
20
+
21
+ @Config("grok_pattern")
22
+ String getGrokPattern();
23
+
24
+ @Config("first_line_pattern")
25
+ @ConfigDefault("null")
26
+ Optional<String> getFirstLinePattern();
27
+
28
+ @Config("grok_pattern_files")
29
+ List<String> getGrokPatternFiles();
30
+
31
+ @Config("timestamp_parser")
32
+ @ConfigDefault("\"ruby\"")
33
+ String getTimestampParser();
34
+
35
+ @Config("columns")
36
+ SchemaConfig getColumns();
37
+
38
+ @Config("stop_on_invalid_record")
39
+ @ConfigDefault("false")
40
+ boolean getStopOnInvalidRecord();
41
+ }
42
+
43
+ @Override
44
+ public void transaction(ConfigSource config, ParserPlugin.Control control) {
45
+ PluginTask task = config.loadConfig(PluginTask.class);
46
+ Schema schema = task.getColumns().toSchema();
47
+
48
+ control.run(task.dump(), schema);
49
+ }
50
+
51
+ @Override
52
+ public void run(TaskSource taskSource, Schema schema, FileInput input, PageOutput output) {
53
+ GrokParserPlugin.PluginTask task = taskSource.loadTask(GrokParserPlugin.PluginTask.class);
54
+
55
+ LineDecoder decoder;
56
+ if (task.getFirstLinePattern().isPresent()) {
57
+ decoder = new MultipleLineDecoder(input, task);
58
+ } else {
59
+ decoder = new LineDecoder(input, task);
60
+ }
61
+
62
+ try (GrokRecordIterator iterator = new GrokRecordIterator(decoder, task)) {
63
+ final List<DateParser> timestampParsers = TimestampParserFactory.create(task);
64
+ PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output);
65
+ while (true) {
66
+ if (!iterator.nextFile()) {
67
+ break;
68
+ }
69
+ while (true) {
70
+ if (!iterator.nextLine()) {
71
+ break;
72
+ }
73
+
74
+ try {
75
+ Map<String, Object> record = iterator.getCurrentRecord();
76
+ if (record.keySet().size() != 0) {
77
+ schema.visitColumns(new GrokColumnVisitor(record, pageBuilder, timestampParsers));
78
+ pageBuilder.addRecord();
79
+ }
80
+ } catch (GrokRecordValidateException e) {
81
+ String skippedLine = iterator.getCurrentLine();
82
+ long lineNumber = iterator.getCurrentLineNumber();
83
+ if (task.getStopOnInvalidRecord()) {
84
+ throw new DataException(String.format("Invalid record at line %d: %s", lineNumber, skippedLine), e);
85
+ }
86
+ logger.warn(String.format("Skipped line %d (%s): %s", lineNumber, e.getMessage(), skippedLine));
87
+ }
88
+ }
89
+ }
90
+ pageBuilder.finish();
91
+ }
92
+ }
93
+
94
+ }
@@ -0,0 +1,71 @@
1
+ package org.embulk.parser.grok;
2
+
3
+ import java.io.Closeable;
4
+ import java.util.Map;
5
+
6
+ import oi.thekraken.grok.api.Grok;
7
+ import oi.thekraken.grok.api.Match;
8
+ import oi.thekraken.grok.api.exception.GrokException;
9
+
10
+ import org.embulk.spi.Exec;
11
+ import org.embulk.spi.util.LineDecoder;
12
+ import org.slf4j.Logger;
13
+
14
+ public class GrokRecordIterator implements Closeable {
15
+
16
+ private LineDecoder decoder;
17
+ private Grok grok = null;
18
+
19
+ private String currentLine = null;
20
+ private long currentLineNumber = 0;
21
+
22
+ private final Logger logger = Exec.getLogger(GrokRecordIterator.class.getName());
23
+
24
+ public GrokRecordIterator(LineDecoder decoder, GrokParserPlugin.PluginTask task) {
25
+ this.decoder = decoder;
26
+ try {
27
+ this.grok = new Grok();
28
+ for (String file : task.getGrokPatternFiles()) {
29
+ this.grok.addPatternFromFile(file);
30
+ }
31
+ this.grok.compile(task.getGrokPattern());
32
+ } catch (GrokException e) {
33
+ throw new RuntimeException(e);
34
+ }
35
+ }
36
+
37
+ public boolean nextFile() {
38
+ currentLine = null;
39
+ currentLineNumber = 0;
40
+ return decoder.nextFile();
41
+ }
42
+
43
+ public boolean nextLine() {
44
+ currentLine = decoder.poll();
45
+ currentLineNumber++;
46
+ return currentLine != null;
47
+ }
48
+
49
+ @Override
50
+ public void close() {
51
+ decoder.close();
52
+ }
53
+
54
+ public Map<String, Object> getCurrentRecord() {
55
+ Match gm = grok.match(currentLine);
56
+ gm.captures();
57
+ if (gm.isNull()) {
58
+ throw new GrokRecordValidateException("Couldn't parse line");
59
+ }
60
+ return gm.toMap();
61
+ }
62
+
63
+ public String getCurrentLine() {
64
+ return currentLine;
65
+ }
66
+
67
+ public long getCurrentLineNumber() {
68
+ return currentLineNumber;
69
+ }
70
+ }
71
+
@@ -0,0 +1,11 @@
1
+ package org.embulk.parser.grok;
2
+
3
/**
 * Unchecked exception signalling that a single input record could not be
 * converted. Both constructors are package-private.
 */
public class GrokRecordValidateException extends RuntimeException {

    /** Creates the exception from a wrapped underlying failure. */
    GrokRecordValidateException(final Throwable cause) {
        super(cause);
    }

    /** Creates the exception from a plain description of the problem. */
    GrokRecordValidateException(final String message) {
        super(message);
    }
}
@@ -0,0 +1,66 @@
1
+ package org.embulk.parser.grok;
2
+
3
+ import com.google.common.base.Strings;
4
+ import oi.thekraken.grok.api.Grok;
5
+ import oi.thekraken.grok.api.Match;
6
+ import oi.thekraken.grok.api.exception.GrokException;
7
+ import org.embulk.spi.FileInput;
8
+ import org.embulk.spi.util.LineDecoder;
9
+
10
+ public class MultipleLineDecoder extends LineDecoder {
11
+
12
+ private StringBuilder buildLines;
13
+ private Grok grok = null;
14
+
15
+ public MultipleLineDecoder(FileInput in, GrokParserPlugin.PluginTask task) {
16
+ super(in, task);
17
+
18
+ try {
19
+ this.grok = new Grok();
20
+ for (String file : task.getGrokPatternFiles()) {
21
+ this.grok.addPatternFromFile(file);
22
+ }
23
+ this.grok.compile(task.getFirstLinePattern().get());
24
+ } catch (GrokException e) {
25
+ throw new RuntimeException(e);
26
+ }
27
+ this.buildLines = new StringBuilder();
28
+ }
29
+
30
+ @Override
31
+ public boolean nextFile() {
32
+ return super.nextFile();
33
+ }
34
+
35
+ @Override
36
+ public void close() {
37
+ super.close();
38
+ }
39
+
40
+ @Override
41
+ public String poll() {
42
+ String currentLine;
43
+ while ((currentLine = super.poll()) != null) {
44
+ Match gm = grok.match(currentLine);
45
+ gm.captures();
46
+
47
+ if (!gm.isNull()) {
48
+ String fullLog = this.buildLines.toString();
49
+ if (!Strings.isNullOrEmpty(fullLog)) {
50
+ this.buildLines.setLength(0);
51
+ this.buildLines.append(currentLine).append(System.lineSeparator());
52
+ return fullLog;
53
+ }
54
+ }
55
+ this.buildLines.append(currentLine).append(System.lineSeparator());
56
+ }
57
+
58
+ String fullLog = this.buildLines.toString();
59
+ if (!Strings.isNullOrEmpty(fullLog)) {
60
+ this.buildLines.setLength(0);
61
+ return fullLog;
62
+ }
63
+
64
+ return null;
65
+ }
66
+ }
@@ -0,0 +1,85 @@
1
+ package org.embulk.parser.grok;
2
+
3
+ import com.google.common.collect.ImmutableMap;
4
+ import org.embulk.config.Task;
5
+ import org.embulk.spi.ColumnConfig;
6
+ import org.embulk.spi.time.Timestamp;
7
+ import org.embulk.spi.time.TimestampParser;
8
+ import org.embulk.spi.type.TimestampType;
9
+ import org.embulk.spi.util.Timestamps;
10
+ import org.joda.time.DateTimeZone;
11
+
12
+ import java.text.ParseException;
13
+ import java.text.SimpleDateFormat;
14
+ import java.util.Arrays;
15
+ import java.util.List;
16
+ import java.util.Locale;
17
+ import java.util.Map;
18
+ import java.util.stream.Collectors;
19
+
20
+ public class TimestampParserFactory {
21
+ private static Map<String, String> formatMap = ImmutableMap.<String, String>builder()
22
+ .put("%Y", "yyyy")
23
+ .put("%m", "MM")
24
+ .put("%d", "dd")
25
+ .put("%H", "HH")
26
+ .put("%M", "mm")
27
+ .put("%S", "ss")
28
+ .put("%z", "z")
29
+ .put("%T", "HH:mm:ss")
30
+ .put("%b", "MMM")
31
+ .put("%N", "SSS")
32
+ .put("%6N", "SSS")
33
+ .put("T", "'T'")
34
+ .build();
35
+
36
+ private interface TimestampColumnOption extends Task, TimestampParser.TimestampColumnOption {
37
+ }
38
+
39
+ private static String convertToJavaDateFormat(String rubyFormat) {
40
+ String current = rubyFormat;
41
+
42
+ for (Map.Entry<String, String> entry : formatMap.entrySet()) {
43
+ current = current.replace(entry.getKey(), entry.getValue());
44
+ }
45
+
46
+ return current;
47
+ }
48
+
49
+ public static List<DateParser> create(GrokParserPlugin.PluginTask task) {
50
+ switch (task.getTimestampParser().toLowerCase()) {
51
+ case "ruby":
52
+ TimestampParser[] ps = Timestamps.newTimestampColumnParsers(task, task.getColumns());
53
+ return Arrays.stream(ps)
54
+ .map(parser -> (DateParser) (text) -> parser.parse(text))
55
+ .collect(Collectors.toList());
56
+ case "epoch":
57
+ return task.getColumns().getColumns().stream()
58
+ .map(x -> (DateParser) (text) -> Timestamp.ofEpochMilli(Long.parseLong(text)))
59
+ .collect(Collectors.toList());
60
+ case "sdf":
61
+ case "simpledateformat":
62
+ default:
63
+ SimpleDateFormat[] parsers = new SimpleDateFormat[task.getColumns().getColumnCount()];
64
+ int i = 0;
65
+ for (ColumnConfig column : task.getColumns().getColumns()) {
66
+ if (column.getType() instanceof TimestampType) {
67
+ TimestampColumnOption option = column.getOption().loadConfig(TimestampColumnOption.class);
68
+ String format = convertToJavaDateFormat(option.getFormat().or("yyyy-MM-dd HH:MM:ss.SSS z"));
69
+ SimpleDateFormat sdf = new SimpleDateFormat(format, Locale.ENGLISH);
70
+ sdf.setTimeZone(option.getTimeZone().or(DateTimeZone.UTC).toTimeZone());
71
+ parsers[i] = sdf;
72
+ }
73
+ i++;
74
+ }
75
+ return Arrays.stream(parsers).map(parser ->
76
+ (DateParser) (String date) -> {
77
+ try {
78
+ return Timestamp.ofEpochMilli(parser.parse(date).getTime());
79
+ } catch (ParseException e) {
80
+ throw new GrokRecordValidateException(e);
81
+ }
82
+ }).collect(Collectors.toList());
83
+ }
84
+ }
85
+ }
@@ -0,0 +1,50 @@
1
+ package org.embulk.parser;
2
+
3
+ import com.google.common.collect.ImmutableMap;
4
+ import org.junit.Test;
5
+
6
+ import java.util.Map;
7
+
8
+ import static org.hamcrest.core.Is.is;
9
+ import static org.junit.Assert.assertThat;
10
+
11
+ public class TestGrokGuessPlugin extends TestGrokPluginBase {
12
+
13
+ public void testGuess(String template, String input, String expected) throws Exception {
14
+
15
+ String inputPath = TestGrokParserPlugin.class.getClassLoader().getResource(input).getPath();
16
+
17
+ Map<String, String> params = ImmutableMap.of(
18
+ "__INPUT_PATH__", inputPath,
19
+ "__PROJECT_DIR__", System.getProperty("user.dir")
20
+ );
21
+ String yamlPath = generateConfigYaml(template, params);
22
+ String diff = tester.guess(yamlPath);
23
+
24
+ assertThat(diff, is(expected));
25
+ }
26
+
27
+ @Test
28
+ public void testApacheGuess() throws Exception {
29
+ testGuess("guess.yml", "apache.log", "in:\n" +
30
+ " parser:\n" +
31
+ " grok_pattern: '%{COMBINEDAPACHELOG}'\n" +
32
+ " columns:\n" +
33
+ " - {name: request, type: string}\n" +
34
+ " - {name: agent, type: string}\n" +
35
+ " - {name: COMMONAPACHELOG, type: string}\n" +
36
+ " - {name: auth, type: string}\n" +
37
+ " - {name: ident, type: string}\n" +
38
+ " - {name: verb, type: string}\n" +
39
+ " - {name: referrer, type: string}\n" +
40
+ " - {name: bytes, type: long}\n" +
41
+ " - {name: response, type: long}\n" +
42
+ " - {name: clientip, type: string}\n" +
43
+ " - {name: COMBINEDAPACHELOG, type: string}\n" +
44
+ " - {name: httpversion, type: string}\n" +
45
+ " - {name: rawrequest, type: string}\n" +
46
+ " - {name: timestamp, format: '%d/%b/%Y:%T %z', type: timestamp}\n");
47
+
48
+ }
49
+
50
+ }
@@ -0,0 +1,55 @@
1
+ package org.embulk.parser;
2
+
3
+ import com.google.common.collect.ImmutableMap;
4
+ import org.embulk.util.StreamUtil;
5
+ import org.junit.Test;
6
+
7
+ import java.nio.file.Files;
8
+ import java.nio.file.Path;
9
+ import java.nio.file.Paths;
10
+ import java.util.ArrayList;
11
+ import java.util.List;
12
+ import java.util.Map;
13
+
14
+ import static junit.framework.Assert.assertEquals;
15
+
16
+ public class TestGrokParserPlugin extends TestGrokPluginBase{
17
+
18
+
19
+ public void testRun(String template, String input, String expected) throws Exception {
20
+
21
+ String inputPath = TestGrokParserPlugin.class.getClassLoader().getResource(input).getPath();
22
+ Path outputPath = Paths.get(outputDirectoryPath.toString(), input);
23
+ String expectedPath = TestGrokParserPlugin.class.getClassLoader().getResource(expected).getPath();
24
+
25
+ Map<String, String> params = ImmutableMap.of(
26
+ "__INPUT_PATH__", inputPath,
27
+ "__OUTPUT_PATH__", outputPath.toString(),
28
+ "__PROJECT_DIR__", System.getProperty("user.dir")
29
+ );
30
+ String yamlPath = generateConfigYaml(template, params);
31
+ tester.run(yamlPath);
32
+
33
+ Path outputFilePath = Paths.get(outputDirectoryPath.toString(), input + "000.00.output.csv");
34
+ List<Integer> counter = new ArrayList<>();
35
+ counter.add(0);
36
+ StreamUtil.zip(Files.lines(outputFilePath), Files.lines(Paths.get(expectedPath)))
37
+ .forEach(tuple -> {
38
+ Integer line = counter.get(0);
39
+ line++;
40
+ counter.set(0, line);
41
+ assertEquals(tuple.b, tuple.a);
42
+ });
43
+ }
44
+
45
+ @Test
46
+ public void testApacheLog() throws Exception {
47
+ testRun("apache.yml", "apache.log", "expected_apache.csv");
48
+ }
49
+
50
+ @Test
51
+ public void testMultiLineLog() throws Exception {
52
+ testRun("multiline.yml", "multiline.log", "expected_multiline.csv");
53
+ }
54
+
55
+ }
@@ -0,0 +1,69 @@
1
+ package org.embulk.parser;
2
+
3
+ import org.embulk.parser.grok.GrokGuessPlugin;
4
+ import org.embulk.parser.grok.GrokParserPlugin;
5
+ import org.embulk.spi.GuessPlugin;
6
+ import org.embulk.spi.ParserPlugin;
7
+ import org.embulk.util.EmbulkPluginTester;
8
+ import org.junit.After;
9
+ import org.junit.Before;
10
+
11
+ import java.io.*;
12
+ import java.nio.file.FileVisitResult;
13
+ import java.nio.file.Files;
14
+ import java.nio.file.Path;
15
+ import java.nio.file.SimpleFileVisitor;
16
+ import java.nio.file.attribute.BasicFileAttributes;
17
+ import java.util.Map;
18
+
19
+ public class TestGrokPluginBase {
20
+
21
+ protected String generateConfigYaml(String template, Map<String, String> params) throws IOException {
22
+ File generatedFile = File.createTempFile("generated", "yaml");
23
+
24
+ String templatePath = TestGrokParserPlugin.class.getClassLoader().getResource(template).getPath();
25
+
26
+ try (BufferedReader br = new BufferedReader(new FileReader(new File(templatePath)));
27
+ BufferedWriter writer = new BufferedWriter(new FileWriter(generatedFile))) {
28
+ String line;
29
+ while ((line = br.readLine()) != null) {
30
+
31
+ for (Map.Entry<String, String> entry : params.entrySet()) {
32
+ line = line.replaceAll(entry.getKey(), entry.getValue());
33
+ }
34
+ writer.write(line + "\n");
35
+ }
36
+ }
37
+
38
+ return generatedFile.getAbsolutePath();
39
+ }
40
+
41
+ protected Path outputDirectoryPath;
42
+ protected EmbulkPluginTester tester;
43
+
44
+ @Before
45
+ public void before() throws IOException {
46
+ outputDirectoryPath = Files.createTempDirectory(null);
47
+ tester = new EmbulkPluginTester();
48
+ tester.addPlugin(ParserPlugin.class, "grok", GrokParserPlugin.class);
49
+ tester.addPlugin(GuessPlugin.class, "grok", GrokGuessPlugin.class);
50
+ }
51
+
52
+ @After
53
+ public void after() throws IOException {
54
+ Files.walkFileTree(outputDirectoryPath, new SimpleFileVisitor<Path>() {
55
+ @Override
56
+ public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
57
+ Files.deleteIfExists(dir);
58
+ return super.postVisitDirectory(dir, exc);
59
+ }
60
+
61
+ @Override
62
+ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
63
+ Files.deleteIfExists(file);
64
+ return super.visitFile(file, attrs);
65
+ }
66
+ });
67
+ }
68
+
69
+ }