embulk-parser-grok 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +113 -0
  5. data/build.gradle +95 -0
  6. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  7. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  8. data/gradlew +164 -0
  9. data/gradlew.bat +90 -0
  10. data/lib/embulk/guess/grok.rb +3 -0
  11. data/lib/embulk/parser/grok.rb +3 -0
  12. data/pattern/grok-patterns +105 -0
  13. data/pattern/my-patterns +12 -0
  14. data/sample/apache.yml +28 -0
  15. data/sample/apache_stop.yml +29 -0
  16. data/sample/guess.yml +19 -0
  17. data/sample/multiline.yml +19 -0
  18. data/src/main/java/org/embulk/parser/grok/DateParser.java +8 -0
  19. data/src/main/java/org/embulk/parser/grok/GrokColumnVisitor.java +98 -0
  20. data/src/main/java/org/embulk/parser/grok/GrokGuessPlugin.java +68 -0
  21. data/src/main/java/org/embulk/parser/grok/GrokGuesser.java +185 -0
  22. data/src/main/java/org/embulk/parser/grok/GrokParserPlugin.java +94 -0
  23. data/src/main/java/org/embulk/parser/grok/GrokRecordIterator.java +71 -0
  24. data/src/main/java/org/embulk/parser/grok/GrokRecordValidateException.java +11 -0
  25. data/src/main/java/org/embulk/parser/grok/MultipleLineDecoder.java +66 -0
  26. data/src/main/java/org/embulk/parser/grok/TimestampParserFactory.java +85 -0
  27. data/src/test/java/org/embulk/parser/TestGrokGuessPlugin.java +50 -0
  28. data/src/test/java/org/embulk/parser/TestGrokParserPlugin.java +55 -0
  29. data/src/test/java/org/embulk/parser/TestGrokPluginBase.java +69 -0
  30. data/src/test/java/org/embulk/util/EmbulkPluginTester.java +80 -0
  31. data/src/test/java/org/embulk/util/StreamUtil.java +30 -0
  32. data/src/test/resources/apache.log +101 -0
  33. data/src/test/resources/apache.yml +35 -0
  34. data/src/test/resources/apache_with_error.log +101 -0
  35. data/src/test/resources/expected_apache.csv +102 -0
  36. data/src/test/resources/expected_multiline.csv +45 -0
  37. data/src/test/resources/guess.yml +19 -0
  38. data/src/test/resources/multiline.log +44 -0
  39. data/src/test/resources/multiline.yml +25 -0
  40. metadata +115 -0
@@ -0,0 +1,3 @@
1
+ Embulk::JavaPlugin.register_guess(
2
+ "grok", "org.embulk.parser.grok.GrokGuessPlugin",
3
+ File.expand_path('../../../../classpath', __FILE__))
@@ -0,0 +1,3 @@
1
+ Embulk::JavaPlugin.register_parser(
2
+ "grok", "org.embulk.parser.grok.GrokParserPlugin",
3
+ File.expand_path('../../../../classpath', __FILE__))
@@ -0,0 +1,105 @@
1
+ # Forked from https://github.com/elasticsearch/logstash/tree/v1.4.0/patterns
2
+ USERNAME [a-zA-Z0-9._-]+
3
+ USER %{USERNAME:UNWANTED}
4
+ INT (?:[+-]?(?:[0-9]+))
5
+ BASE10NUM (?<![0-9.+-])(?>[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+)))
6
+ NUMBER (?:%{BASE10NUM:UNWANTED})
7
+ BASE16NUM (?<![0-9A-Fa-f])(?:[+-]?(?:0x)?(?:[0-9A-Fa-f]+))
8
+ BASE16FLOAT \b(?<![0-9A-Fa-f.])(?:[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+)))\b
9
+
10
+ POSINT \b(?:[1-9][0-9]*)\b
11
+ NONNEGINT \b(?:[0-9]+)\b
12
+ WORD \b\w+\b
13
+ WORDOREMPTY \b\w*\b
14
+ NOTSPACE \S+
15
+ SPACE \s*
16
+ DATA .*?
17
+ GREEDYDATA .*
18
+ #QUOTEDSTRING (?:(?<!\\)(?:"(?:\\.|[^\\"])*"|(?:'(?:\\.|[^\\'])*')|(?:`(?:\\.|[^\\`])*`)))
19
+ QUOTEDSTRING (?>(?<!\\)(?>"(?>\\.|[^\\"]+)+"|""|(?>'(?>\\.|[^\\']+)+')|''|(?>`(?>\\.|[^\\`]+)+`)|``))
20
+ UUID [A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12}
21
+ RESOURCEID \b[A-Za-z0-9_/-]*\b
22
+
23
+ # Networking
24
+ MAC (?:%{CISCOMAC:UNWANTED}|%{WINDOWSMAC:UNWANTED}|%{COMMONMAC:UNWANTED})
25
+ CISCOMAC (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4})
26
+ WINDOWSMAC (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})
27
+ COMMONMAC (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})
28
+ IPV6 ((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?
29
+ IPV4 (?<![0-9])(?:(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2}))(?![0-9])
30
+ IP (?:%{IPV6:UNWANTED}|%{IPV4:UNWANTED})
31
+ HOSTNAME \b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b)
32
+ HOST %{HOSTNAME:UNWANTED}
33
+ IPORHOST (?:%{HOSTNAME:UNWANTED}|%{IP:UNWANTED})
34
+ HOSTPORT (?:%{IPORHOST}:%{POSINT:PORT})
35
+
36
+ # paths
37
+ PATH (?:%{UNIXPATH}|%{WINPATH})
38
+ UNIXPATH (?>/(?>[\w_%!$@:.,~-]+|\\.)*)+
39
+ #UNIXPATH (?<![\w\/])(?:/[^\/\s?*]*)+
40
+ TTY (?:/dev/(pts|tty([pq])?)(\w+)?/?(?:[0-9]+))
41
+ WINPATH (?>[A-Za-z]+:|\\)(?:\\[^\\?*]*)+
42
+ URIPROTO [A-Za-z]+(\+[A-Za-z+]+)?
43
+ URIHOST %{IPORHOST}(?::%{POSINT:port})?
44
+ # uripath comes loosely from RFC1738, but mostly from what Firefox
45
+ # doesn't turn into %XX
46
+ URIPATH (?:/[A-Za-z0-9$.+!*'(){},~:;=@#%_\-]*)+
47
+ #URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)?
48
+ URIPARAM \?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\-\[\]]*
49
+ URIPATHPARAM %{URIPATH}(?:%{URIPARAM})?
50
+ URI %{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?
51
+
52
+ # Months: January, Feb, 3, 03, 12, December
53
+ MONTH \b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b
54
+ MONTHNUM (?:0?[1-9]|1[0-2])
55
+ MONTHNUM2 (?:0[1-9]|1[0-2])
56
+ MONTHDAY (?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9])
57
+
58
+ # Days: Monday, Tue, Thu, etc...
59
+ DAY (?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?)
60
+
61
+ # Years?
62
+ YEAR (?>\d\d){1,2}
63
+ # Time: HH:MM:SS
64
+ #TIME \d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?
65
+ # I'm still on the fence about using grok to perform the time match,
66
+ # since it's probably slower.
67
+ # TIME %{POSINT<24}:%{POSINT<60}(?::%{POSINT<60}(?:\.%{POSINT})?)?
68
+ HOUR (?:2[0123]|[01]?[0-9])
69
+ MINUTE (?:[0-5][0-9])
70
+ # '60' is a leap second in most time standards and thus is valid.
71
+ SECOND (?:(?:[0-5]?[0-9]|60)(?:[:.,][0-9]+)?)
72
+ # A time must not start or end in the middle of a longer run of digits.
+ TIME (?<![0-9])%{HOUR:UNWANTED}:%{MINUTE:UNWANTED}(?::%{SECOND:UNWANTED})(?![0-9])
73
+ # datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it)
74
+ DATE_US %{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR}
75
+ DATE_EU %{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR}
76
+ ISO8601_TIMEZONE (?:Z|[+-]%{HOUR:UNWANTED}(?::?%{MINUTE:UNWANTED}))
77
+ ISO8601_SECOND (?:%{SECOND:UNWANTED}|60)
78
+ TIMESTAMP_ISO8601 %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR:UNWANTED}:?%{MINUTE:UNWANTED}(?::?%{SECOND:UNWANTED})?%{ISO8601_TIMEZONE}?
79
+ DATE %{DATE_US}|%{DATE_EU}
80
+ DATESTAMP %{DATE}[- ]%{TIME}
81
+ TZ (?:[PMCE][SD]T|UTC)
82
+ DATESTAMP_RFC822 %{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ}
83
+ DATESTAMP_RFC2822 %{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{TIME} %{ISO8601_TIMEZONE}
84
+ DATESTAMP_OTHER %{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR}
85
+ DATESTAMP_EVENTLOG %{YEAR}%{MONTHNUM2}%{MONTHDAY}%{HOUR:UNWANTED}%{MINUTE:UNWANTED}%{SECOND:UNWANTED}
86
+
87
+ # Syslog Dates: Month Day HH:MM:SS
88
+ SYSLOGTIMESTAMP %{MONTH} +%{MONTHDAY} %{TIME}
89
+ PROG (?:[\w._/%-]+)
90
+ SYSLOGPROG %{PROG:program}(?:\[%{POSINT:pid}\])?
91
+ SYSLOGHOST %{IPORHOST}
92
+ SYSLOGFACILITY <%{NONNEGINT:facility}.%{NONNEGINT:priority}>
93
+ HTTPDATE %{MONTHDAY:UNWANTED}/%{MONTH:UNWANTED}/%{YEAR:UNWANTED}:%{TIME:UNWANTED} %{INT:UNWANTED}
94
+
95
+ # Shortcuts
96
+ QS %{QUOTEDSTRING:UNWANTED}
97
+
98
+ # Log formats
99
+ SYSLOGBASE %{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:
100
+
101
+ MESSAGESLOG %{SYSLOGBASE} %{DATA}
102
+
103
+ COMMONAPACHELOG %{IPORHOST:clientip} %{USER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] "(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})" %{NUMBER:response} (?:%{NUMBER:bytes}|-)
104
+ COMBINEDAPACHELOG %{COMMONAPACHELOG} %{QS:referrer} %{QS:agent}
105
+
@@ -0,0 +1,12 @@
1
+
2
+ # Log level in lower, capitalised or upper case (e.g. warn, Warning, WARN).
+ LOGLEVEL ([Aa]lert|ALERT|[Tt]race|TRACE|[Dd]ebug|DEBUG|[Nn]otice|NOTICE|[Ii]nfo|INFO|[Ww]arn?(?:ing)?|WARN?(?:ING)?|[Ee]rr?(?:or)?|ERR?(?:OR)?|[Cc]rit?(?:ical)?|CRIT?(?:ICAL)?|[Ff]atal|FATAL|[Ss]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?)
3
+
4
+ TIMESTAMP_ISO8601_WITH_SPACE %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR:UNWANTED}:?%{MINUTE:UNWANTED}(?::?%{SECOND:UNWANTED})?( %{ISO8601_TIMEZONE})?
5
+
6
+ MULTILINES (.*+\n)*.*
7
+ MULTILINESTOTHEEND (.*+\n)*+
8
+
9
+ MULTILINELOG_FIRSTLINE %{TIMESTAMP_ISO8601_WITH_SPACE:timestamp} \[%{LOGLEVEL:log_level}\] %{DATA:message}$
10
+ MULTILINELOG %{TIMESTAMP_ISO8601_WITH_SPACE:timestamp} \[%{LOGLEVEL:log_level}\] %{DATA:message}(?:\n%{MULTILINESTOTHEEND:stack_trace})?$
11
+
12
+
data/sample/apache.yml ADDED
@@ -0,0 +1,28 @@
1
+ in:
2
+ type: file
3
+ path_prefix: src/test/resources/apache.log
4
+ parser:
5
+ charset: UTF-8
6
+ newline: CRLF
7
+ type: grok
8
+ grok_pattern_files: [pattern/grok-patterns, pattern/my-patterns]
9
+ timestamp_parser: ruby
10
+ grok_pattern: '%{COMBINEDAPACHELOG}'
11
+ columns:
12
+ - {name: request, type: string}
13
+ - {name: agent, type: string}
14
+ - {name: COMMONAPACHELOG, type: string}
15
+ - {name: auth, type: string}
16
+ - {name: ident, type: string}
17
+ - {name: verb, type: string}
18
+ - {name: referrer, type: string}
19
+ - {name: bytes, type: long}
20
+ - {name: response, type: long}
21
+ - {name: clientip, type: string}
22
+ - {name: COMBINEDAPACHELOG, type: string}
23
+ - {name: httpversion, type: string}
24
+ - {name: rawrequest, type: string}
25
+ - {name: timestamp, format: '%d/%b/%Y:%T %z', type: timestamp}
26
+ exec:
27
+ guess_plugins: [grok]
28
+ out: {type: stdout}
@@ -0,0 +1,29 @@
1
+ in:
2
+ type: file
3
+ path_prefix: src/test/resources/apache_with_error.log
4
+ parser:
5
+ charset: UTF-8
6
+ newline: CRLF
7
+ type: grok
8
+ grok_pattern_files: [pattern/grok-patterns, pattern/my-patterns]
9
+ timestamp_parser: ruby
10
+ grok_pattern: '%{COMBINEDAPACHELOG}'
11
+ stop_on_invalid_record: true
12
+ columns:
13
+ - {name: request, type: string}
14
+ - {name: agent, type: string}
15
+ - {name: COMMONAPACHELOG, type: string}
16
+ - {name: auth, type: string}
17
+ - {name: ident, type: string}
18
+ - {name: verb, type: string}
19
+ - {name: referrer, type: string}
20
+ - {name: bytes, type: long}
21
+ - {name: response, type: long}
22
+ - {name: clientip, type: string}
23
+ - {name: COMBINEDAPACHELOG, type: string}
24
+ - {name: httpversion, type: string}
25
+ - {name: rawrequest, type: string}
26
+ - {name: timestamp, format: '%d/%b/%Y:%T %z', type: timestamp}
27
+ exec:
28
+ guess_plugins: [grok]
29
+ out: {type: stdout}
data/sample/guess.yml ADDED
@@ -0,0 +1,19 @@
1
+ in:
2
+ type: file
3
+ path_prefix: src/test/resources/apache.log
4
+ parser:
5
+ charset: UTF-8
6
+ newline: CRLF
7
+ type: grok
8
+ grok_pattern_files:
9
+ - pattern/grok-patterns
10
+ - pattern/my-patterns
11
+ guess_patterns:
12
+ - "%{COMBINEDAPACHELOG}"
13
+ - "%{COMMONAPACHELOG}"
14
+ timestamp_parser: ruby
15
+ exec:
16
+ guess_plugins:
17
+ - "grok"
18
+ out:
19
+ type: stdout
@@ -0,0 +1,19 @@
1
+ in:
2
+ type: file
3
+ path_prefix: src/test/resources/multiline.log
4
+ parser:
5
+ charset: UTF-8
6
+ newline: CRLF
7
+ type: grok
8
+ grok_pattern_files: [pattern/grok-patterns, pattern/my-patterns]
9
+ timestamp_parser: ruby
10
+ first_line_pattern: '%{MULTILINELOG_FIRSTLINE}'
11
+ grok_pattern: '%{MULTILINELOG}'
12
+ columns:
13
+ - {name: timestamp, format: '%Y-%m-%d %H:%M:%S.%N %z', type: timestamp}
14
+ - {name: log_level, type: string}
15
+ - {name: message, type: string}
16
+ - {name: stack_trace, type: string}
17
+ exec:
18
+ guess_plugins: [grok]
19
+ out: {type: stdout}
@@ -0,0 +1,8 @@
1
+ package org.embulk.parser.grok;
2
+
3
+ import org.embulk.spi.time.Timestamp;
4
+
5
+ @FunctionalInterface
6
+ public interface DateParser {
7
+ Timestamp parse(String date);
8
+ }
@@ -0,0 +1,98 @@
1
+ package org.embulk.parser.grok;
2
+
3
+ import com.google.common.collect.ImmutableSet;
4
+ import org.embulk.spi.Column;
5
+ import org.embulk.spi.ColumnVisitor;
6
+ import org.embulk.spi.Exec;
7
+ import org.embulk.spi.PageBuilder;
8
+ import org.slf4j.Logger;
9
+
10
+ import java.util.List;
11
+ import java.util.Map;
12
+
13
+ public class GrokColumnVisitor implements ColumnVisitor {
14
+
15
+ private Map<String, Object> record;
16
+ private PageBuilder pageBuilder;
17
+ private final List<DateParser> timestampParsers;
18
+ private final Logger logger = Exec.getLogger(GrokColumnVisitor.class.getName());
19
+
20
+ private static final ImmutableSet<String> TRUE_STRINGS =
21
+ ImmutableSet.of(
22
+ "true", "True", "TRUE",
23
+ "yes", "Yes", "YES",
24
+ "t", "T", "y", "Y",
25
+ "on", "On", "ON",
26
+ "1");
27
+
28
+ public GrokColumnVisitor(Map<String, Object> record, PageBuilder pageBuilder, List<DateParser> timestampParsers) {
29
+ this.record = record;
30
+ this.pageBuilder = pageBuilder;
31
+ this.timestampParsers = timestampParsers;
32
+ }
33
+
34
+ @Override
35
+ public void booleanColumn(Column column) {
36
+ if (record.get(column.getName()) == null) {
37
+ pageBuilder.setNull(column);
38
+ } else {
39
+ pageBuilder.setBoolean(column, TRUE_STRINGS.contains(record.get(column.getName()).toString()));
40
+ }
41
+ }
42
+
43
+ @Override
44
+ public void longColumn(Column column) {
45
+ Object longNum = record.get(column.getName());
46
+ if (longNum == null || longNum.toString().equals("null")) {
47
+ pageBuilder.setNull(column);
48
+ } else {
49
+ try {
50
+ pageBuilder.setLong(column, Long.parseLong(longNum.toString()));
51
+ } catch (NumberFormatException e) {
52
+ logger.error("This column is not Long:" + longNum.toString(), e);
53
+ throw new GrokRecordValidateException(e);
54
+ }
55
+ }
56
+ }
57
+
58
+ @Override
59
+ public void doubleColumn(Column column) {
60
+ Object dbl = record.get(column.getName());
61
+ if (dbl == null) {
62
+ pageBuilder.setNull(column);
63
+ } else {
64
+ try {
65
+ pageBuilder.setDouble(column, Double.parseDouble(dbl.toString()));
66
+ } catch (NumberFormatException e) {
67
+ logger.error("This column is not Double:" + dbl.toString(), e);
68
+ throw new GrokRecordValidateException(e);
69
+ }
70
+ }
71
+ }
72
+
73
+ @Override
74
+ public void stringColumn(Column column) {
75
+ if (record.get(column.getName()) == null)
76
+ pageBuilder.setNull(column);
77
+ else {
78
+ pageBuilder.setString(column, record.get(column.getName()).toString());
79
+ }
80
+ }
81
+
82
+ @Override
83
+ public void timestampColumn(Column column) {
84
+ Object time = record.get(column.getName());
85
+
86
+ if (time == null) {
87
+ pageBuilder.setNull(column);
88
+ } else {
89
+ String timeString = time.toString();
90
+ try {
91
+ pageBuilder.setTimestamp(column, timestampParsers.get(column.getIndex()).parse(timeString));
92
+ } catch (RuntimeException e) {
93
+ logger.error("TimestampParseError:" + column.getName() + ", timeString:" + timeString + ", getIndex:" + column.getIndex(), e);
94
+ throw new GrokRecordValidateException(e);
95
+ }
96
+ }
97
+ }
98
+ }
@@ -0,0 +1,68 @@
1
+ package org.embulk.parser.grok;
2
+
3
+ import com.google.common.collect.ImmutableList;
4
+ import com.google.common.collect.ImmutableMap;
5
+ import oi.thekraken.grok.api.exception.GrokException;
6
+ import org.embulk.config.*;
7
+ import org.embulk.spi.Buffer;
8
+ import org.embulk.spi.Exec;
9
+ import org.embulk.spi.GuessPlugin;
10
+ import org.embulk.spi.util.LineDecoder;
11
+ import org.embulk.spi.util.ListFileInput;
12
+ import org.slf4j.Logger;
13
+
14
+ import java.util.ArrayList;
15
+ import java.util.List;
16
+ import java.util.Map;
17
+
18
+ public class GrokGuessPlugin implements GuessPlugin {
19
+ public final Logger logger = Exec.getLogger(GrokGuessPlugin.class.getName());
20
+
21
+ public interface PluginTask
22
+ extends Task, LineDecoder.DecoderTask {
23
+
24
+ @Config("grok_pattern_files")
25
+ List<String> getGrokPatternFiles();
26
+
27
+ @Config("guess_patterns")
28
+ @ConfigDefault("[]")
29
+ List<String> getGuessPatterns();
30
+ }
31
+
32
+ @Override
33
+ public ConfigDiff guess(ConfigSource config, Buffer sample) {
34
+
35
+ GrokGuessPlugin.PluginTask task = config.getNested("parser").loadConfig(GrokGuessPlugin.PluginTask.class);
36
+
37
+ LineDecoder.DecoderTask decoderTask = config.loadConfig(LineDecoder.DecoderTask.class);
38
+ LineDecoder decoder = new LineDecoder(new ListFileInput(ImmutableList.of(ImmutableList.of((sample)))), decoderTask);
39
+
40
+ List<String> sampleLines = new ArrayList<>();
41
+ while (true) {
42
+ if (!decoder.nextFile()) {
43
+ break;
44
+ }
45
+ while (true) {
46
+ String line = decoder.poll();
47
+ if (line == null) {
48
+ break;
49
+ }
50
+ sampleLines.add(line);
51
+ }
52
+ }
53
+
54
+ GrokGuesser guesser = new GrokGuesser(
55
+ task.getGuessPatterns(),
56
+ task.getGrokPatternFiles()
57
+ );
58
+ try {
59
+ String pattern = guesser.guessPattern(sampleLines);
60
+ List<Map<String, Object>> columns = guesser.guessColumns(sampleLines, pattern);
61
+ return Exec.newConfigDiff().set(
62
+ "parser", ImmutableMap.of("grok_pattern", pattern, "columns", columns));
63
+ } catch (GrokException e) {
64
+ return Exec.newConfigDiff();
65
+ }
66
+
67
+ }
68
+ }
@@ -0,0 +1,185 @@
1
+ package org.embulk.parser.grok;
2
+
3
+ import com.google.common.collect.ImmutableMap;
4
+ import oi.thekraken.grok.api.Grok;
5
+ import oi.thekraken.grok.api.Match;
6
+ import oi.thekraken.grok.api.exception.GrokException;
7
+
8
+ import java.text.ParseException;
9
+ import java.text.SimpleDateFormat;
10
+ import java.util.*;
11
+ import java.util.stream.Collectors;
12
+
13
+ public class GrokGuesser {
14
+
15
+ private List<String> guessPatterns;
16
+ private List<String> patternFiles;
17
+
18
+ public GrokGuesser(List<String> guessPatterns, List<String> patternFiles) {
19
+ this.guessPatterns = guessPatterns;
20
+ this.patternFiles = patternFiles;
21
+ }
22
+
23
+ public String guessPattern(List<String> sampleLines) throws GrokException {
24
+ for (String guessPattern : guessPatterns) {
25
+ Grok grok = new Grok();
26
+ for (String file : patternFiles) {
27
+ grok.addPatternFromFile(file);
28
+ }
29
+ try {
30
+ grok.compile(guessPattern);
31
+ } catch (GrokException e) {
32
+ continue;
33
+ }
34
+
35
+ boolean allMatch = sampleLines.stream().allMatch(line -> {
36
+ Match m = grok.match(line);
37
+ m.captures();
38
+ return !m.isNull();
39
+ });
40
+ if (allMatch) {
41
+ return guessPattern;
42
+ }
43
+ }
44
+
45
+ throw new GrokException("Patterns not matched");
46
+ }
47
+
48
+ public List<Map<String, Object>> guessColumns(List<String> sampleLines, String pattern) throws GrokException {
49
+
50
+ Grok grok = new Grok();
51
+ for (String file : patternFiles) {
52
+ grok.addPatternFromFile(file);
53
+ }
54
+ grok.compile(pattern);
55
+
56
+ List<Map<String, Object>> records = sampleLines.stream().map(line -> {
57
+ Match m = grok.match(line);
58
+ m.captures();
59
+ return m.toMap();
60
+ }).collect(Collectors.toList());
61
+
62
+ return guessTypesFromRecords(records);
63
+ }
64
+
65
+
66
+ private List<Map<String, Object>> guessTypesFromRecords(List<Map<String, Object>> samples) {
67
+ Map<String, ColumnType> types = new HashMap<>();
68
+ for (Map<String, Object> record : samples) {
69
+ for (Map.Entry<String, Object> entry : record.entrySet()) {
70
+ ColumnType currentType = guessType(entry.getValue());
71
+ if (types.containsKey(entry.getKey())) {
72
+ types.put(entry.getKey(), mergeType(currentType, types.get(entry.getKey())));
73
+ } else {
74
+ types.put(entry.getKey(), currentType);
75
+ }
76
+ }
77
+ }
78
+ return types.entrySet().stream().map(entry -> {
79
+ Map<String, Object> val = new HashMap<>();
80
+ val.put("name", entry.getKey());
81
+ val.put("type", entry.getValue().getType());
82
+ if (entry.getValue().getType().equals("timestamp")) {
83
+ val.put("format", entry.getValue().getFormat());
84
+ }
85
+ return val;
86
+ }).collect(Collectors.toList());
87
+ }
88
+
89
+ private Map<String, SimpleDateFormat> timestampFormats = ImmutableMap.of(
90
+ "%d/%b/%Y:%T %z", new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss z", Locale.ENGLISH),
91
+ "%Y-%m-%d %H:%M:%S", new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"),
92
+ "%Y-%m-%d %H:%M:%S.%N", new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"),
93
+ "%Y-%m-%d %H:%M:%S.%N %z", new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS z"),
94
+ "%Y-%m-%dT%H:%M:%S.%N%z", new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSz")
95
+ );
96
+
97
+ private ColumnType guessType(Object value) {
98
+
99
+ if (value == null) {
100
+ return new ColumnType("string");
101
+ } else if (value instanceof Integer) {
102
+ return new ColumnType("long");
103
+ } else if (value instanceof Double) {
104
+ return new ColumnType("double");
105
+ } else {
106
+ Optional<String> dateFormat = timestampFormats.entrySet().stream().filter(e -> {
107
+ try {
108
+ return e.getValue().parse(value.toString()) != null;
109
+ } catch (ParseException e1) {
110
+ return false;
111
+ }
112
+ }).map(Map.Entry::getKey).findFirst();
113
+ if (dateFormat.isPresent()) {
114
+ return new ColumnType("timestamp", dateFormat.get());
115
+ } else {
116
+ return new ColumnType("string");
117
+ }
118
+ }
119
+ }
120
+
121
+ private ColumnType mergeType(ColumnType t1, ColumnType t2) {
122
+ if (t1.equals(t2)) {
123
+ return t1;
124
+ }
125
+
126
+ if (t1.getType().equals("string") || t2.getType().equals("string")) {
127
+ return new ColumnType("string");
128
+ }
129
+
130
+ if (t1.getType().equals("timestamp") || t2.getType().equals("timestamp")) {
131
+ return new ColumnType("string");
132
+ }
133
+
134
+ if ((t1.getType().equals("long") && t2.getType().equals("double"))
135
+ || (t1.getType().equals("double") && t2.getType().equals("long"))) {
136
+ return new ColumnType("double");
137
+ }
138
+
139
+ return new ColumnType("string");
140
+ }
141
+
142
+ static class ColumnType {
143
+ private String type;
144
+ private String format;
145
+
146
+ public ColumnType(String type) {
147
+ this.type = type;
148
+ this.format = null;
149
+ }
150
+
151
+ public ColumnType(String type, String format) {
152
+ this.type = type;
153
+ this.format = format;
154
+ }
155
+
156
+ public String getType() {
157
+ return type;
158
+ }
159
+
160
+ public void setType(String type) {
161
+ this.type = type;
162
+ }
163
+
164
+ public String getFormat() {
165
+ return format;
166
+ }
167
+
168
+ public void setFormat(String format) {
169
+ this.format = format;
170
+ }
171
+
172
+ @Override
173
+ public boolean equals(Object o) {
174
+ if (this == o) return true;
175
+ if (o == null || getClass() != o.getClass()) return false;
176
+
177
+ ColumnType that = (ColumnType) o;
178
+
179
+ if (type != null ? !type.equals(that.type) : that.type != null) return false;
180
+ return format != null ? format.equals(that.format) : that.format == null;
181
+
182
+ }
183
+ }
184
+ }
185
+