embulk-parser-grok 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +113 -0
  5. data/build.gradle +95 -0
  6. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  7. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  8. data/gradlew +164 -0
  9. data/gradlew.bat +90 -0
  10. data/lib/embulk/guess/grok.rb +3 -0
  11. data/lib/embulk/parser/grok.rb +3 -0
  12. data/pattern/grok-patterns +105 -0
  13. data/pattern/my-patterns +12 -0
  14. data/sample/apache.yml +28 -0
  15. data/sample/apache_stop.yml +29 -0
  16. data/sample/guess.yml +19 -0
  17. data/sample/multiline.yml +19 -0
  18. data/src/main/java/org/embulk/parser/grok/DateParser.java +8 -0
  19. data/src/main/java/org/embulk/parser/grok/GrokColumnVisitor.java +98 -0
  20. data/src/main/java/org/embulk/parser/grok/GrokGuessPlugin.java +68 -0
  21. data/src/main/java/org/embulk/parser/grok/GrokGuesser.java +185 -0
  22. data/src/main/java/org/embulk/parser/grok/GrokParserPlugin.java +94 -0
  23. data/src/main/java/org/embulk/parser/grok/GrokRecordIterator.java +71 -0
  24. data/src/main/java/org/embulk/parser/grok/GrokRecordValidateException.java +11 -0
  25. data/src/main/java/org/embulk/parser/grok/MultipleLineDecoder.java +66 -0
  26. data/src/main/java/org/embulk/parser/grok/TimestampParserFactory.java +85 -0
  27. data/src/test/java/org/embulk/parser/TestGrokGuessPlugin.java +50 -0
  28. data/src/test/java/org/embulk/parser/TestGrokParserPlugin.java +55 -0
  29. data/src/test/java/org/embulk/parser/TestGrokPluginBase.java +69 -0
  30. data/src/test/java/org/embulk/util/EmbulkPluginTester.java +80 -0
  31. data/src/test/java/org/embulk/util/StreamUtil.java +30 -0
  32. data/src/test/resources/apache.log +101 -0
  33. data/src/test/resources/apache.yml +35 -0
  34. data/src/test/resources/apache_with_error.log +101 -0
  35. data/src/test/resources/expected_apache.csv +102 -0
  36. data/src/test/resources/expected_multiline.csv +45 -0
  37. data/src/test/resources/guess.yml +19 -0
  38. data/src/test/resources/multiline.log +44 -0
  39. data/src/test/resources/multiline.yml +25 -0
  40. metadata +115 -0
@@ -0,0 +1,3 @@
1
# Register the Java class GrokGuessPlugin as the "grok" guess plugin.
# The plugin's classes are loaded from the `classpath` directory resolved
# relative to this file.
classpath_dir = File.expand_path('../../../../classpath', __FILE__)
Embulk::JavaPlugin.register_guess("grok", "org.embulk.parser.grok.GrokGuessPlugin", classpath_dir)
@@ -0,0 +1,3 @@
1
# Register the Java class GrokParserPlugin as the "grok" parser plugin.
# The plugin's classes are loaded from the `classpath` directory resolved
# relative to this file.
classpath_dir = File.expand_path('../../../../classpath', __FILE__)
Embulk::JavaPlugin.register_parser("grok", "org.embulk.parser.grok.GrokParserPlugin", classpath_dir)
@@ -0,0 +1,105 @@
1
+ # Forked from https://github.com/elasticsearch/logstash/tree/v1.4.0/patterns
2
+ USERNAME [a-zA-Z0-9._-]+
3
+ USER %{USERNAME:UNWANTED}
4
+ INT (?:[+-]?(?:[0-9]+))
5
+ BASE10NUM (?<![0-9.+-])(?>[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+)))
6
+ NUMBER (?:%{BASE10NUM:UNWANTED})
7
+ BASE16NUM (?<![0-9A-Fa-f])(?:[+-]?(?:0x)?(?:[0-9A-Fa-f]+))
8
+ BASE16FLOAT \b(?<![0-9A-Fa-f.])(?:[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+)))\b
9
+
10
+ POSINT \b(?:[1-9][0-9]*)\b
11
+ NONNEGINT \b(?:[0-9]+)\b
12
+ WORD \b\w+\b
13
+ WORDOREMPTY \b\w*\b
14
+ NOTSPACE \S+
15
+ SPACE \s*
16
+ DATA .*?
17
+ GREEDYDATA .*
18
+ #QUOTEDSTRING (?:(?<!\\)(?:"(?:\\.|[^\\"])*"|(?:'(?:\\.|[^\\'])*')|(?:`(?:\\.|[^\\`])*`)))
19
+ QUOTEDSTRING (?>(?<!\\)(?>"(?>\\.|[^\\"]+)+"|""|(?>'(?>\\.|[^\\']+)+')|''|(?>`(?>\\.|[^\\`]+)+`)|``))
20
+ UUID [A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12}
21
+ RESOURCEID \b[A-Za-z0-9_/-]*\b
22
+
23
+ # Networking
24
+ MAC (?:%{CISCOMAC:UNWANTED}|%{WINDOWSMAC:UNWANTED}|%{COMMONMAC:UNWANTED})
25
+ CISCOMAC (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4})
26
+ WINDOWSMAC (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})
27
+ COMMONMAC (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})
28
+ IPV6 ((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?
29
+ IPV4 (?<![0-9])(?:(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2}))(?![0-9])
30
+ IP (?:%{IPV6:UNWANTED}|%{IPV4:UNWANTED})
31
+ HOSTNAME \b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b)
32
+ HOST %{HOSTNAME:UNWANTED}
33
+ IPORHOST (?:%{HOSTNAME:UNWANTED}|%{IP:UNWANTED})
34
+ HOSTPORT (?:%{IPORHOST}:%{POSINT:PORT})
35
+
36
+ # paths
37
+ PATH (?:%{UNIXPATH}|%{WINPATH})
38
+ UNIXPATH (?>/(?>[\w_%!$@:.,~-]+|\\.)*)+
39
+ #UNIXPATH (?<![\w\/])(?:/[^\/\s?*]*)+
40
+ TTY (?:/dev/(pts|tty([pq])?)(\w+)?/?(?:[0-9]+))
41
+ WINPATH (?>[A-Za-z]+:|\\)(?:\\[^\\?*]*)+
42
+ URIPROTO [A-Za-z]+(\+[A-Za-z+]+)?
43
+ URIHOST %{IPORHOST}(?::%{POSINT:port})?
44
+ # uripath comes loosely from RFC1738, but mostly from what Firefox
45
+ # doesn't turn into %XX
46
+ URIPATH (?:/[A-Za-z0-9$.+!*'(){},~:;=@#%_\-]*)+
47
+ #URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)?
48
+ URIPARAM \?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\-\[\]]*
49
+ URIPATHPARAM %{URIPATH}(?:%{URIPARAM})?
50
+ URI %{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?
51
+
52
+ # Months: January, Feb, 3, 03, 12, December
53
+ MONTH \b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b
54
+ MONTHNUM (?:0?[1-9]|1[0-2])
55
+ MONTHNUM2 (?:0[1-9]|1[0-2])
56
+ MONTHDAY (?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9])
57
+
58
+ # Days: Monday, Tue, Thu, etc...
59
+ DAY (?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?)
60
+
61
+ # Years?
62
+ YEAR (?>\d\d){1,2}
63
+ # Time: HH:MM:SS
64
+ #TIME \d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?
65
+ # I'm still on the fence about using grok to perform the time match,
66
+ # since it's probably slower.
67
+ # TIME %{POSINT<24}:%{POSINT<60}(?::%{POSINT<60}(?:\.%{POSINT})?)?
68
+ HOUR (?:2[0123]|[01]?[0-9])
69
+ MINUTE (?:[0-5][0-9])
70
+ # '60' is a leap second in most time standards and thus is valid.
71
+ SECOND (?:(?:[0-5]?[0-9]|60)(?:[:.,][0-9]+)?)
72
# TIME must not begin in the middle of a longer digit run: (?<![0-9]) is a
# negative lookbehind that mirrors the trailing (?![0-9]) lookahead.
TIME (?<![0-9])%{HOUR:UNWANTED}:%{MINUTE:UNWANTED}(?::%{SECOND:UNWANTED})(?![0-9])
73
+ # datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it)
74
+ DATE_US %{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR}
75
+ DATE_EU %{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR}
76
+ ISO8601_TIMEZONE (?:Z|[+-]%{HOUR:UNWANTED}(?::?%{MINUTE:UNWANTED}))
77
+ ISO8601_SECOND (?:%{SECOND:UNWANTED}|60)
78
+ TIMESTAMP_ISO8601 %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR:UNWANTED}:?%{MINUTE:UNWANTED}(?::?%{SECOND:UNWANTED})?%{ISO8601_TIMEZONE}?
79
+ DATE %{DATE_US}|%{DATE_EU}
80
+ DATESTAMP %{DATE}[- ]%{TIME}
81
+ TZ (?:[PMCE][SD]T|UTC)
82
+ DATESTAMP_RFC822 %{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ}
83
+ DATESTAMP_RFC2822 %{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{TIME} %{ISO8601_TIMEZONE}
84
+ DATESTAMP_OTHER %{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR}
85
+ DATESTAMP_EVENTLOG %{YEAR}%{MONTHNUM2}%{MONTHDAY}%{HOUR:UNWANTED}%{MINUTE:UNWANTED}%{SECOND:UNWANTED}
86
+
87
+ # Syslog Dates: Month Day HH:MM:SS
88
+ SYSLOGTIMESTAMP %{MONTH} +%{MONTHDAY} %{TIME}
89
+ PROG (?:[\w._/%-]+)
90
+ SYSLOGPROG %{PROG:program}(?:\[%{POSINT:pid}\])?
91
+ SYSLOGHOST %{IPORHOST}
92
+ SYSLOGFACILITY <%{NONNEGINT:facility}.%{NONNEGINT:priority}>
93
+ HTTPDATE %{MONTHDAY:UNWANTED}/%{MONTH:UNWANTED}/%{YEAR:UNWANTED}:%{TIME:UNWANTED} %{INT:UNWANTED}
94
+
95
+ # Shortcuts
96
+ QS %{QUOTEDSTRING:UNWANTED}
97
+
98
+ # Log formats
99
+ SYSLOGBASE %{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:
100
+
101
+ MESSAGESLOG %{SYSLOGBASE} %{DATA}
102
+
103
+ COMMONAPACHELOG %{IPORHOST:clientip} %{USER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] "(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})" %{NUMBER:response} (?:%{NUMBER:bytes}|-)
104
+ COMBINEDAPACHELOG %{COMMONAPACHELOG} %{QS:referrer} %{QS:agent}
105
+
@@ -0,0 +1,12 @@
1
+
2
+ LOGLEVEL ([A|a]lert|ALERT|[T|t]race|TRACE|[D|d]ebug|DEBUG|[N|n]otice|NOTICE|[I|i]nfo|INFO|[W|w]arn?(?:ing)?|WARN?(?:ING)?|[E|e]rr?(?:or)?|ERR?(?:OR)?|[C|c]rit?(?:ical)?|CRIT?(?:ICAL)?|[F|f]atal|FATAL|[S|s]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?)
3
+
4
+ TIMESTAMP_ISO8601_WITH_SPACE %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR:UNWANTED}:?%{MINUTE:UNWANTED}(?::?%{SECOND:UNWANTED})?( %{ISO8601_TIMEZONE})?
5
+
6
+ MULTILINES (.*+\n)*.*
7
+ MULTILINESTOTHEEND (.*+\n)*+
8
+
9
+ MULTILINELOG_FIRSTLINE %{TIMESTAMP_ISO8601_WITH_SPACE:timestamp} \[%{LOGLEVEL:log_level}\] %{DATA:message}$
10
+ MULTILINELOG %{TIMESTAMP_ISO8601_WITH_SPACE:timestamp} \[%{LOGLEVEL:log_level}\] %{DATA:message}(?:\n%{MULTILINESTOTHEEND:stack_trace})?$
11
+
12
+
data/sample/apache.yml ADDED
@@ -0,0 +1,28 @@
1
+ in:
2
+ type: file
3
+ path_prefix: src/test/resources/apache.log
4
+ parser:
5
+ charset: UTF-8
6
+ newline: CRLF
7
+ type: grok
8
+ grok_pattern_files: [pattern/grok-patterns, pattern/my-patterns]
9
+ timestamp_parser: ruby
10
+ grok_pattern: '%{COMBINEDAPACHELOG}'
11
+ columns:
12
+ - {name: request, type: string}
13
+ - {name: agent, type: string}
14
+ - {name: COMMONAPACHELOG, type: string}
15
+ - {name: auth, type: string}
16
+ - {name: ident, type: string}
17
+ - {name: verb, type: string}
18
+ - {name: referrer, type: string}
19
+ - {name: bytes, type: long}
20
+ - {name: response, type: long}
21
+ - {name: clientip, type: string}
22
+ - {name: COMBINEDAPACHELOG, type: string}
23
+ - {name: httpversion, type: string}
24
+ - {name: rawrequest, type: string}
25
+ - {name: timestamp, format: '%d/%b/%Y:%T %z', type: timestamp}
26
+ exec:
27
+ guess_plugins: [grok]
28
+ out: {type: stdout}
@@ -0,0 +1,29 @@
1
+ in:
2
+ type: file
3
+ path_prefix: src/test/resources/apache_with_error.log
4
+ parser:
5
+ charset: UTF-8
6
+ newline: CRLF
7
+ type: grok
8
+ grok_pattern_files: [pattern/grok-patterns, pattern/my-patterns]
9
+ timestamp_parser: ruby
10
+ grok_pattern: '%{COMBINEDAPACHELOG}'
11
+ stop_on_invalid_record: true
12
+ columns:
13
+ - {name: request, type: string}
14
+ - {name: agent, type: string}
15
+ - {name: COMMONAPACHELOG, type: string}
16
+ - {name: auth, type: string}
17
+ - {name: ident, type: string}
18
+ - {name: verb, type: string}
19
+ - {name: referrer, type: string}
20
+ - {name: bytes, type: long}
21
+ - {name: response, type: long}
22
+ - {name: clientip, type: string}
23
+ - {name: COMBINEDAPACHELOG, type: string}
24
+ - {name: httpversion, type: string}
25
+ - {name: rawrequest, type: string}
26
+ - {name: timestamp, format: '%d/%b/%Y:%T %z', type: timestamp}
27
+ exec:
28
+ guess_plugins: [grok]
29
+ out: {type: stdout}
data/sample/guess.yml ADDED
@@ -0,0 +1,19 @@
1
+ in:
2
+ type: file
3
+ path_prefix: src/test/resources/apache.log
4
+ parser:
5
+ charset: UTF-8
6
+ newline: CRLF
7
+ type: grok
8
+ grok_pattern_files:
9
+ - pattern/grok-patterns
10
+ - pattern/my-patterns
11
+ guess_patterns:
12
+ - "%{COMBINEDAPACHELOG}"
13
+ - "%{COMMONAPACHELOG}"
14
+ timestamp_parser: ruby
15
+ exec:
16
+ guess_plugins:
17
+ - "grok"
18
+ out:
19
+ type: stdout
@@ -0,0 +1,19 @@
1
+ in:
2
+ type: file
3
+ path_prefix: src/test/resources/multiline.log
4
+ parser:
5
+ charset: UTF-8
6
+ newline: CRLF
7
+ type: grok
8
+ grok_pattern_files: [pattern/grok-patterns, pattern/my-patterns]
9
+ timestamp_parser: ruby
10
+ first_line_pattern: '%{MULTILINELOG_FIRSTLINE}'
11
+ grok_pattern: '%{MULTILINELOG}'
12
+ columns:
13
+ - {name: timestamp, format: '%Y-%m-%d %H:%M:%S.%N %z', type: timestamp}
14
+ - {name: log_level, type: string}
15
+ - {name: message, type: string}
16
+ - {name: stack_trace, type: string}
17
+ exec:
18
+ guess_plugins: [grok]
19
+ out: {type: stdout}
@@ -0,0 +1,8 @@
1
+ package org.embulk.parser.grok;
2
+
3
+ import org.embulk.spi.time.Timestamp;
4
+
5
+ @FunctionalInterface
6
+ public interface DateParser {
7
+ Timestamp parse(String date);
8
+ }
@@ -0,0 +1,98 @@
1
+ package org.embulk.parser.grok;
2
+
3
+ import com.google.common.collect.ImmutableSet;
4
+ import org.embulk.spi.Column;
5
+ import org.embulk.spi.ColumnVisitor;
6
+ import org.embulk.spi.Exec;
7
+ import org.embulk.spi.PageBuilder;
8
+ import org.slf4j.Logger;
9
+
10
+ import java.util.List;
11
+ import java.util.Map;
12
+
13
+ public class GrokColumnVisitor implements ColumnVisitor {
14
+
15
+ private Map<String, Object> record;
16
+ private PageBuilder pageBuilder;
17
+ private final List<DateParser> timestampParsers;
18
+ private final Logger logger = Exec.getLogger(GrokColumnVisitor.class.getName());
19
+
20
+ private static final ImmutableSet<String> TRUE_STRINGS =
21
+ ImmutableSet.of(
22
+ "true", "True", "TRUE",
23
+ "yes", "Yes", "YES",
24
+ "t", "T", "y", "Y",
25
+ "on", "On", "ON",
26
+ "1");
27
+
28
+ public GrokColumnVisitor(Map<String, Object> record, PageBuilder pageBuilder, List<DateParser> timestampParsers) {
29
+ this.record = record;
30
+ this.pageBuilder = pageBuilder;
31
+ this.timestampParsers = timestampParsers;
32
+ }
33
+
34
+ @Override
35
+ public void booleanColumn(Column column) {
36
+ if (record.get(column.getName()) == null) {
37
+ pageBuilder.setNull(column);
38
+ } else {
39
+ pageBuilder.setBoolean(column, TRUE_STRINGS.contains(record.get(column.getName()).toString()));
40
+ }
41
+ }
42
+
43
+ @Override
44
+ public void longColumn(Column column) {
45
+ Object longNum = record.get(column.getName());
46
+ if (longNum == null || longNum.toString().equals("null")) {
47
+ pageBuilder.setNull(column);
48
+ } else {
49
+ try {
50
+ pageBuilder.setLong(column, Long.parseLong(longNum.toString()));
51
+ } catch (NumberFormatException e) {
52
+ logger.error("This column is not Long:" + longNum.toString(), e);
53
+ throw new GrokRecordValidateException(e);
54
+ }
55
+ }
56
+ }
57
+
58
+ @Override
59
+ public void doubleColumn(Column column) {
60
+ Object dbl = record.get(column.getName());
61
+ if (dbl == null) {
62
+ pageBuilder.setNull(column);
63
+ } else {
64
+ try {
65
+ pageBuilder.setDouble(column, Double.parseDouble(dbl.toString()));
66
+ } catch (NumberFormatException e) {
67
+ logger.error("This column is not Double:" + dbl.toString(), e);
68
+ throw new GrokRecordValidateException(e);
69
+ }
70
+ }
71
+ }
72
+
73
+ @Override
74
+ public void stringColumn(Column column) {
75
+ if (record.get(column.getName()) == null)
76
+ pageBuilder.setNull(column);
77
+ else {
78
+ pageBuilder.setString(column, record.get(column.getName()).toString());
79
+ }
80
+ }
81
+
82
+ @Override
83
+ public void timestampColumn(Column column) {
84
+ Object time = record.get(column.getName());
85
+
86
+ if (time == null) {
87
+ pageBuilder.setNull(column);
88
+ } else {
89
+ String timeString = time.toString();
90
+ try {
91
+ pageBuilder.setTimestamp(column, timestampParsers.get(column.getIndex()).parse(timeString));
92
+ } catch (RuntimeException e) {
93
+ logger.error("TimestampParseError:" + column.getName() + ", timeString:" + timeString + ", getIndex:" + column.getIndex(), e);
94
+ throw new GrokRecordValidateException(e);
95
+ }
96
+ }
97
+ }
98
+ }
@@ -0,0 +1,68 @@
1
+ package org.embulk.parser.grok;
2
+
3
+ import com.google.common.collect.ImmutableList;
4
+ import com.google.common.collect.ImmutableMap;
5
+ import oi.thekraken.grok.api.exception.GrokException;
6
+ import org.embulk.config.*;
7
+ import org.embulk.spi.Buffer;
8
+ import org.embulk.spi.Exec;
9
+ import org.embulk.spi.GuessPlugin;
10
+ import org.embulk.spi.util.LineDecoder;
11
+ import org.embulk.spi.util.ListFileInput;
12
+ import org.slf4j.Logger;
13
+
14
+ import java.util.ArrayList;
15
+ import java.util.List;
16
+ import java.util.Map;
17
+
18
+ public class GrokGuessPlugin implements GuessPlugin {
19
+ public final Logger logger = Exec.getLogger(GrokGuessPlugin.class.getName());
20
+
21
+ public interface PluginTask
22
+ extends Task, LineDecoder.DecoderTask {
23
+
24
+ @Config("grok_pattern_files")
25
+ List<String> getGrokPatternFiles();
26
+
27
+ @Config("guess_patterns")
28
+ @ConfigDefault("[]")
29
+ List<String> getGuessPatterns();
30
+ }
31
+
32
+ @Override
33
+ public ConfigDiff guess(ConfigSource config, Buffer sample) {
34
+
35
+ GrokGuessPlugin.PluginTask task = config.getNested("parser").loadConfig(GrokGuessPlugin.PluginTask.class);
36
+
37
+ LineDecoder.DecoderTask decoderTask = config.loadConfig(LineDecoder.DecoderTask.class);
38
+ LineDecoder decoder = new LineDecoder(new ListFileInput(ImmutableList.of(ImmutableList.of((sample)))), decoderTask);
39
+
40
+ List<String> sampleLines = new ArrayList<>();
41
+ while (true) {
42
+ if (!decoder.nextFile()) {
43
+ break;
44
+ }
45
+ while (true) {
46
+ String line = decoder.poll();
47
+ if (line == null) {
48
+ break;
49
+ }
50
+ sampleLines.add(line);
51
+ }
52
+ }
53
+
54
+ GrokGuesser guesser = new GrokGuesser(
55
+ task.getGuessPatterns(),
56
+ task.getGrokPatternFiles()
57
+ );
58
+ try {
59
+ String pattern = guesser.guessPattern(sampleLines);
60
+ List<Map<String, Object>> columns = guesser.guessColumns(sampleLines, pattern);
61
+ return Exec.newConfigDiff().set(
62
+ "parser", ImmutableMap.of("grok_pattern", pattern, "columns", columns));
63
+ } catch (GrokException e) {
64
+ return Exec.newConfigDiff();
65
+ }
66
+
67
+ }
68
+ }
@@ -0,0 +1,185 @@
1
+ package org.embulk.parser.grok;
2
+
3
+ import com.google.common.collect.ImmutableMap;
4
+ import oi.thekraken.grok.api.Grok;
5
+ import oi.thekraken.grok.api.Match;
6
+ import oi.thekraken.grok.api.exception.GrokException;
7
+
8
+ import java.text.ParseException;
9
+ import java.text.SimpleDateFormat;
10
+ import java.util.*;
11
+ import java.util.stream.Collectors;
12
+
13
+ public class GrokGuesser {
14
+
15
+ private List<String> guessPatterns;
16
+ private List<String> patternFiles;
17
+
18
+ public GrokGuesser(List<String> guessPatterns, List<String> patternFiles) {
19
+ this.guessPatterns = guessPatterns;
20
+ this.patternFiles = patternFiles;
21
+ }
22
+
23
+ public String guessPattern(List<String> sampleLines) throws GrokException {
24
+ for (String guessPattern : guessPatterns) {
25
+ Grok grok = new Grok();
26
+ for (String file : patternFiles) {
27
+ grok.addPatternFromFile(file);
28
+ }
29
+ try {
30
+ grok.compile(guessPattern);
31
+ } catch (GrokException e) {
32
+ continue;
33
+ }
34
+
35
+ boolean allMatch = sampleLines.stream().allMatch(line -> {
36
+ Match m = grok.match(line);
37
+ m.captures();
38
+ return !m.isNull();
39
+ });
40
+ if (allMatch) {
41
+ return guessPattern;
42
+ }
43
+ }
44
+
45
+ throw new GrokException("Patterns not matched");
46
+ }
47
+
48
+ public List<Map<String, Object>> guessColumns(List<String> sampleLines, String pattern) throws GrokException {
49
+
50
+ Grok grok = new Grok();
51
+ for (String file : patternFiles) {
52
+ grok.addPatternFromFile(file);
53
+ }
54
+ grok.compile(pattern);
55
+
56
+ List<Map<String, Object>> records = sampleLines.stream().map(line -> {
57
+ Match m = grok.match(line);
58
+ m.captures();
59
+ return m.toMap();
60
+ }).collect(Collectors.toList());
61
+
62
+ return guessTypesFromRecords(records);
63
+ }
64
+
65
+
66
+ private List<Map<String, Object>> guessTypesFromRecords(List<Map<String, Object>> samples) {
67
+ Map<String, ColumnType> types = new HashMap<>();
68
+ for (Map<String, Object> record : samples) {
69
+ for (Map.Entry<String, Object> entry : record.entrySet()) {
70
+ ColumnType currentType = guessType(entry.getValue());
71
+ if (types.containsKey(entry.getKey())) {
72
+ types.put(entry.getKey(), mergeType(currentType, types.get(entry.getKey())));
73
+ } else {
74
+ types.put(entry.getKey(), currentType);
75
+ }
76
+ }
77
+ }
78
+ return types.entrySet().stream().map(entry -> {
79
+ Map<String, Object> val = new HashMap<>();
80
+ val.put("name", entry.getKey());
81
+ val.put("type", entry.getValue().getType());
82
+ if (entry.getValue().getType().equals("timestamp")) {
83
+ val.put("format", entry.getValue().getFormat());
84
+ }
85
+ return val;
86
+ }).collect(Collectors.toList());
87
+ }
88
+
89
+ private Map<String, SimpleDateFormat> timestampFormats = ImmutableMap.of(
90
+ "%d/%b/%Y:%T %z", new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss z", Locale.ENGLISH),
91
+ "%Y-%m-%d %H:%M:%S", new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"),
92
+ "%Y-%m-%d %H:%M:%S.%N", new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"),
93
+ "%Y-%m-%d %H:%M:%S.%N %z", new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS z"),
94
+ "%Y-%m-%dT%H:%M:%S.%N%z", new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSz")
95
+ );
96
+
97
+ private ColumnType guessType(Object value) {
98
+
99
+ if (value == null) {
100
+ return new ColumnType("string");
101
+ } else if (value instanceof Integer) {
102
+ return new ColumnType("long");
103
+ } else if (value instanceof Double) {
104
+ return new ColumnType("double");
105
+ } else {
106
+ Optional<String> dateFormat = timestampFormats.entrySet().stream().filter(e -> {
107
+ try {
108
+ return e.getValue().parse(value.toString()) != null;
109
+ } catch (ParseException e1) {
110
+ return false;
111
+ }
112
+ }).map(Map.Entry::getKey).findFirst();
113
+ if (dateFormat.isPresent()) {
114
+ return new ColumnType("timestamp", dateFormat.get());
115
+ } else {
116
+ return new ColumnType("string");
117
+ }
118
+ }
119
+ }
120
+
121
+ private ColumnType mergeType(ColumnType t1, ColumnType t2) {
122
+ if (t1.equals(t2)) {
123
+ return t1;
124
+ }
125
+
126
+ if (t1.getType().equals("string") || t2.getType().equals("string")) {
127
+ return new ColumnType("string");
128
+ }
129
+
130
+ if (t1.getType().equals("timestamp") || t2.getType().equals("timestamp")) {
131
+ return new ColumnType("string");
132
+ }
133
+
134
+ if ((t1.getType().equals("long") && t2.getType().equals("double"))
135
+ || (t1.getType().equals("double") && t2.getType().equals("long"))) {
136
+ return new ColumnType("double");
137
+ }
138
+
139
+ return new ColumnType("string");
140
+ }
141
+
142
+ static class ColumnType {
143
+ private String type;
144
+ private String format;
145
+
146
+ public ColumnType(String type) {
147
+ this.type = type;
148
+ this.format = null;
149
+ }
150
+
151
+ public ColumnType(String type, String format) {
152
+ this.type = type;
153
+ this.format = format;
154
+ }
155
+
156
+ public String getType() {
157
+ return type;
158
+ }
159
+
160
+ public void setType(String type) {
161
+ this.type = type;
162
+ }
163
+
164
+ public String getFormat() {
165
+ return format;
166
+ }
167
+
168
+ public void setFormat(String format) {
169
+ this.format = format;
170
+ }
171
+
172
+ @Override
173
+ public boolean equals(Object o) {
174
+ if (this == o) return true;
175
+ if (o == null || getClass() != o.getClass()) return false;
176
+
177
+ ColumnType that = (ColumnType) o;
178
+
179
+ if (type != null ? !type.equals(that.type) : that.type != null) return false;
180
+ return format != null ? format.equals(that.format) : that.format == null;
181
+
182
+ }
183
+ }
184
+ }
185
+