embulk-parser-grok 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/LICENSE.txt +21 -0
- data/README.md +113 -0
- data/build.gradle +95 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +6 -0
- data/gradlew +164 -0
- data/gradlew.bat +90 -0
- data/lib/embulk/guess/grok.rb +3 -0
- data/lib/embulk/parser/grok.rb +3 -0
- data/pattern/grok-patterns +105 -0
- data/pattern/my-patterns +12 -0
- data/sample/apache.yml +28 -0
- data/sample/apache_stop.yml +29 -0
- data/sample/guess.yml +19 -0
- data/sample/multiline.yml +19 -0
- data/src/main/java/org/embulk/parser/grok/DateParser.java +8 -0
- data/src/main/java/org/embulk/parser/grok/GrokColumnVisitor.java +98 -0
- data/src/main/java/org/embulk/parser/grok/GrokGuessPlugin.java +68 -0
- data/src/main/java/org/embulk/parser/grok/GrokGuesser.java +185 -0
- data/src/main/java/org/embulk/parser/grok/GrokParserPlugin.java +94 -0
- data/src/main/java/org/embulk/parser/grok/GrokRecordIterator.java +71 -0
- data/src/main/java/org/embulk/parser/grok/GrokRecordValidateException.java +11 -0
- data/src/main/java/org/embulk/parser/grok/MultipleLineDecoder.java +66 -0
- data/src/main/java/org/embulk/parser/grok/TimestampParserFactory.java +85 -0
- data/src/test/java/org/embulk/parser/TestGrokGuessPlugin.java +50 -0
- data/src/test/java/org/embulk/parser/TestGrokParserPlugin.java +55 -0
- data/src/test/java/org/embulk/parser/TestGrokPluginBase.java +69 -0
- data/src/test/java/org/embulk/util/EmbulkPluginTester.java +80 -0
- data/src/test/java/org/embulk/util/StreamUtil.java +30 -0
- data/src/test/resources/apache.log +101 -0
- data/src/test/resources/apache.yml +35 -0
- data/src/test/resources/apache_with_error.log +101 -0
- data/src/test/resources/expected_apache.csv +102 -0
- data/src/test/resources/expected_multiline.csv +45 -0
- data/src/test/resources/guess.yml +19 -0
- data/src/test/resources/multiline.log +44 -0
- data/src/test/resources/multiline.yml +25 -0
- metadata +115 -0
@@ -0,0 +1,105 @@
|
|
1
|
+
# Forked from https://github.com/elasticsearch/logstash/tree/v1.4.0/patterns
|
2
|
+
USERNAME [a-zA-Z0-9._-]+
|
3
|
+
USER %{USERNAME:UNWANTED}
|
4
|
+
INT (?:[+-]?(?:[0-9]+))
|
5
|
+
BASE10NUM (?<![0-9.+-])(?>[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+)))
|
6
|
+
NUMBER (?:%{BASE10NUM:UNWANTED})
|
7
|
+
BASE16NUM (?<![0-9A-Fa-f])(?:[+-]?(?:0x)?(?:[0-9A-Fa-f]+))
|
8
|
+
BASE16FLOAT \b(?<![0-9A-Fa-f.])(?:[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+)))\b
|
9
|
+
|
10
|
+
POSINT \b(?:[1-9][0-9]*)\b
|
11
|
+
NONNEGINT \b(?:[0-9]+)\b
|
12
|
+
WORD \b\w+\b
|
13
|
+
WORDOREMPTY \b\w*\b
|
14
|
+
NOTSPACE \S+
|
15
|
+
SPACE \s*
|
16
|
+
DATA .*?
|
17
|
+
GREEDYDATA .*
|
18
|
+
#QUOTEDSTRING (?:(?<!\\)(?:"(?:\\.|[^\\"])*"|(?:'(?:\\.|[^\\'])*')|(?:`(?:\\.|[^\\`])*`)))
|
19
|
+
QUOTEDSTRING (?>(?<!\\)(?>"(?>\\.|[^\\"]+)+"|""|(?>'(?>\\.|[^\\']+)+')|''|(?>`(?>\\.|[^\\`]+)+`)|``))
|
20
|
+
UUID [A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12}
|
21
|
+
RESOURCEID \b[A-Za-z0-9_/-]*\b
|
22
|
+
|
23
|
+
# Networking
|
24
|
+
MAC (?:%{CISCOMAC:UNWANTED}|%{WINDOWSMAC:UNWANTED}|%{COMMONMAC:UNWANTED})
|
25
|
+
CISCOMAC (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4})
|
26
|
+
WINDOWSMAC (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})
|
27
|
+
COMMONMAC (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})
|
28
|
+
IPV6 ((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?
|
29
|
+
IPV4 (?<![0-9])(?:(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2}))(?![0-9])
|
30
|
+
IP (?:%{IPV6:UNWANTED}|%{IPV4:UNWANTED})
|
31
|
+
HOSTNAME \b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b)
|
32
|
+
HOST %{HOSTNAME:UNWANTED}
|
33
|
+
IPORHOST (?:%{HOSTNAME:UNWANTED}|%{IP:UNWANTED})
|
34
|
+
HOSTPORT (?:%{IPORHOST}:%{POSINT:PORT})
|
35
|
+
|
36
|
+
# paths
|
37
|
+
PATH (?:%{UNIXPATH}|%{WINPATH})
|
38
|
+
UNIXPATH (?>/(?>[\w_%!$@:.,~-]+|\\.)*)+
|
39
|
+
#UNIXPATH (?<![\w\/])(?:/[^\/\s?*]*)+
|
40
|
+
TTY (?:/dev/(pts|tty([pq])?)(\w+)?/?(?:[0-9]+))
|
41
|
+
WINPATH (?>[A-Za-z]+:|\\)(?:\\[^\\?*]*)+
|
42
|
+
URIPROTO [A-Za-z]+(\+[A-Za-z+]+)?
|
43
|
+
URIHOST %{IPORHOST}(?::%{POSINT:port})?
|
44
|
+
# uripath comes loosely from RFC1738, but mostly from what Firefox
|
45
|
+
# doesn't turn into %XX
|
46
|
+
URIPATH (?:/[A-Za-z0-9$.+!*'(){},~:;=@#%_\-]*)+
|
47
|
+
#URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)?
|
48
|
+
URIPARAM \?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\-\[\]]*
|
49
|
+
URIPATHPARAM %{URIPATH}(?:%{URIPARAM})?
|
50
|
+
URI %{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?
|
51
|
+
|
52
|
+
# Months: January, Feb, 3, 03, 12, December
|
53
|
+
MONTH \b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b
|
54
|
+
MONTHNUM (?:0?[1-9]|1[0-2])
|
55
|
+
MONTHNUM2 (?:0[1-9]|1[0-2])
|
56
|
+
MONTHDAY (?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9])
|
57
|
+
|
58
|
+
# Days: Monday, Tue, Thu, etc...
|
59
|
+
DAY (?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?)
|
60
|
+
|
61
|
+
# Years?
|
62
|
+
YEAR (?>\d\d){1,2}
|
63
|
+
# Time: HH:MM:SS
|
64
|
+
#TIME \d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?
|
65
|
+
# I'm still on the fence about using grok to perform the time match,
|
66
|
+
# since it's probably slower.
|
67
|
+
# TIME %{POSINT<24}:%{POSINT<60}(?::%{POSINT<60}(?:\.%{POSINT})?)?
|
68
|
+
HOUR (?:2[0123]|[01]?[0-9])
|
69
|
+
MINUTE (?:[0-5][0-9])
|
70
|
+
# '60' is a leap second in most time standards and thus is valid.
|
71
|
+
SECOND (?:(?:[0-5]?[0-9]|60)(?:[:.,][0-9]+)?)
|
72
|
+
# TIME must not be directly preceded or followed by a digit.
# The leading guard is a negative lookbehind, (?<!...); the previous (?!<...) form was a
# lookahead for a literal '<' and therefore never rejected a preceding digit.
TIME (?<![0-9])%{HOUR:UNWANTED}:%{MINUTE:UNWANTED}(?::%{SECOND:UNWANTED})(?![0-9])
|
73
|
+
# datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it)
|
74
|
+
DATE_US %{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR}
|
75
|
+
DATE_EU %{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR}
|
76
|
+
ISO8601_TIMEZONE (?:Z|[+-]%{HOUR:UNWANTED}(?::?%{MINUTE:UNWANTED}))
|
77
|
+
ISO8601_SECOND (?:%{SECOND:UNWANTED}|60)
|
78
|
+
TIMESTAMP_ISO8601 %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR:UNWANTED}:?%{MINUTE:UNWANTED}(?::?%{SECOND:UNWANTED})?%{ISO8601_TIMEZONE}?
|
79
|
+
DATE %{DATE_US}|%{DATE_EU}
|
80
|
+
DATESTAMP %{DATE}[- ]%{TIME}
|
81
|
+
TZ (?:[PMCE][SD]T|UTC)
|
82
|
+
DATESTAMP_RFC822 %{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ}
|
83
|
+
DATESTAMP_RFC2822 %{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{TIME} %{ISO8601_TIMEZONE}
|
84
|
+
DATESTAMP_OTHER %{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR}
|
85
|
+
DATESTAMP_EVENTLOG %{YEAR}%{MONTHNUM2}%{MONTHDAY}%{HOUR:UNWANTED}%{MINUTE:UNWANTED}%{SECOND:UNWANTED}
|
86
|
+
|
87
|
+
# Syslog Dates: Month Day HH:MM:SS
|
88
|
+
SYSLOGTIMESTAMP %{MONTH} +%{MONTHDAY} %{TIME}
|
89
|
+
PROG (?:[\w._/%-]+)
|
90
|
+
SYSLOGPROG %{PROG:program}(?:\[%{POSINT:pid}\])?
|
91
|
+
SYSLOGHOST %{IPORHOST}
|
92
|
+
SYSLOGFACILITY <%{NONNEGINT:facility}.%{NONNEGINT:priority}>
|
93
|
+
HTTPDATE %{MONTHDAY:UNWANTED}/%{MONTH:UNWANTED}/%{YEAR:UNWANTED}:%{TIME:UNWANTED} %{INT:UNWANTED}
|
94
|
+
|
95
|
+
# Shortcuts
|
96
|
+
QS %{QUOTEDSTRING:UNWANTED}
|
97
|
+
|
98
|
+
# Log formats
|
99
|
+
SYSLOGBASE %{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:
|
100
|
+
|
101
|
+
MESSAGESLOG %{SYSLOGBASE} %{DATA}
|
102
|
+
|
103
|
+
COMMONAPACHELOG %{IPORHOST:clientip} %{USER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] "(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})" %{NUMBER:response} (?:%{NUMBER:bytes}|-)
|
104
|
+
COMBINEDAPACHELOG %{COMMONAPACHELOG} %{QS:referrer} %{QS:agent}
|
105
|
+
|
data/pattern/my-patterns
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
|
2
|
+
# Log level keywords in lower-case, capitalized and upper-case spellings.
# Character classes list the two letter cases only; a '|' inside [...] is a literal pipe, not alternation.
LOGLEVEL ([Aa]lert|ALERT|[Tt]race|TRACE|[Dd]ebug|DEBUG|[Nn]otice|NOTICE|[Ii]nfo|INFO|[Ww]arn?(?:ing)?|WARN?(?:ING)?|[Ee]rr?(?:or)?|ERR?(?:OR)?|[Cc]rit?(?:ical)?|CRIT?(?:ICAL)?|[Ff]atal|FATAL|[Ss]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?)
|
3
|
+
|
4
|
+
TIMESTAMP_ISO8601_WITH_SPACE %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR:UNWANTED}:?%{MINUTE:UNWANTED}(?::?%{SECOND:UNWANTED})?( %{ISO8601_TIMEZONE})?
|
5
|
+
|
6
|
+
MULTILINES (.*+\n)*.*
|
7
|
+
MULTILINESTOTHEEND (.*+\n)*+
|
8
|
+
|
9
|
+
MULTILINELOG_FIRSTLINE %{TIMESTAMP_ISO8601_WITH_SPACE:timestamp} \[%{LOGLEVEL:log_level}\] %{DATA:message}$
|
10
|
+
MULTILINELOG %{TIMESTAMP_ISO8601_WITH_SPACE:timestamp} \[%{LOGLEVEL:log_level}\] %{DATA:message}(?:\n%{MULTILINESTOTHEEND:stack_trace})?$
|
11
|
+
|
12
|
+
|
data/sample/apache.yml
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: src/test/resources/apache.log
|
4
|
+
parser:
|
5
|
+
charset: UTF-8
|
6
|
+
newline: CRLF
|
7
|
+
type: grok
|
8
|
+
grok_pattern_files: [pattern/grok-patterns, pattern/my-patterns]
|
9
|
+
timestamp_parser: ruby
|
10
|
+
grok_pattern: '%{COMBINEDAPACHELOG}'
|
11
|
+
columns:
|
12
|
+
- {name: request, type: string}
|
13
|
+
- {name: agent, type: string}
|
14
|
+
- {name: COMMONAPACHELOG, type: string}
|
15
|
+
- {name: auth, type: string}
|
16
|
+
- {name: ident, type: string}
|
17
|
+
- {name: verb, type: string}
|
18
|
+
- {name: referrer, type: string}
|
19
|
+
- {name: bytes, type: long}
|
20
|
+
- {name: response, type: long}
|
21
|
+
- {name: clientip, type: string}
|
22
|
+
- {name: COMBINEDAPACHELOG, type: string}
|
23
|
+
- {name: httpversion, type: string}
|
24
|
+
- {name: rawrequest, type: string}
|
25
|
+
- {name: timestamp, format: '%d/%b/%Y:%T %z', type: timestamp}
|
26
|
+
exec:
|
27
|
+
guess_plugins: [grok]
|
28
|
+
out: {type: stdout}
|
@@ -0,0 +1,29 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: src/test/resources/apache_with_error.log
|
4
|
+
parser:
|
5
|
+
charset: UTF-8
|
6
|
+
newline: CRLF
|
7
|
+
type: grok
|
8
|
+
grok_pattern_files: [pattern/grok-patterns, pattern/my-patterns]
|
9
|
+
timestamp_parser: ruby
|
10
|
+
grok_pattern: '%{COMBINEDAPACHELOG}'
|
11
|
+
stop_on_invalid_record: true
|
12
|
+
columns:
|
13
|
+
- {name: request, type: string}
|
14
|
+
- {name: agent, type: string}
|
15
|
+
- {name: COMMONAPACHELOG, type: string}
|
16
|
+
- {name: auth, type: string}
|
17
|
+
- {name: ident, type: string}
|
18
|
+
- {name: verb, type: string}
|
19
|
+
- {name: referrer, type: string}
|
20
|
+
- {name: bytes, type: long}
|
21
|
+
- {name: response, type: long}
|
22
|
+
- {name: clientip, type: string}
|
23
|
+
- {name: COMBINEDAPACHELOG, type: string}
|
24
|
+
- {name: httpversion, type: string}
|
25
|
+
- {name: rawrequest, type: string}
|
26
|
+
- {name: timestamp, format: '%d/%b/%Y:%T %z', type: timestamp}
|
27
|
+
exec:
|
28
|
+
guess_plugins: [grok]
|
29
|
+
out: {type: stdout}
|
data/sample/guess.yml
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: src/test/resources/apache.log
|
4
|
+
parser:
|
5
|
+
charset: UTF-8
|
6
|
+
newline: CRLF
|
7
|
+
type: grok
|
8
|
+
grok_pattern_files:
|
9
|
+
- pattern/grok-patterns
|
10
|
+
- pattern/my-patterns
|
11
|
+
guess_patterns:
|
12
|
+
- "%{COMBINEDAPACHELOG}"
|
13
|
+
- "%{COMMONAPACHELOG}"
|
14
|
+
timestamp_parser: ruby
|
15
|
+
exec:
|
16
|
+
guess_plugins:
|
17
|
+
- "grok"
|
18
|
+
out:
|
19
|
+
type: stdout
|
@@ -0,0 +1,19 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: src/test/resources/multiline.log
|
4
|
+
parser:
|
5
|
+
charset: UTF-8
|
6
|
+
newline: CRLF
|
7
|
+
type: grok
|
8
|
+
grok_pattern_files: [pattern/grok-patterns, pattern/my-patterns]
|
9
|
+
timestamp_parser: ruby
|
10
|
+
first_line_pattern: '%{MULTILINELOG_FIRSTLINE}'
|
11
|
+
grok_pattern: '%{MULTILINELOG}'
|
12
|
+
columns:
|
13
|
+
- {name: timestamp, format: '%Y-%m-%d %H:%M:%S.%N %z', type: timestamp}
|
14
|
+
- {name: log_level, type: string}
|
15
|
+
- {name: message, type: string}
|
16
|
+
- {name: stack_trace, type: string}
|
17
|
+
exec:
|
18
|
+
guess_plugins: [grok]
|
19
|
+
out: {type: stdout}
|
@@ -0,0 +1,98 @@
|
|
1
|
+
package org.embulk.parser.grok;
|
2
|
+
|
3
|
+
import com.google.common.collect.ImmutableSet;
|
4
|
+
import org.embulk.spi.Column;
|
5
|
+
import org.embulk.spi.ColumnVisitor;
|
6
|
+
import org.embulk.spi.Exec;
|
7
|
+
import org.embulk.spi.PageBuilder;
|
8
|
+
import org.slf4j.Logger;
|
9
|
+
|
10
|
+
import java.util.List;
|
11
|
+
import java.util.Map;
|
12
|
+
|
13
|
+
public class GrokColumnVisitor implements ColumnVisitor {
|
14
|
+
|
15
|
+
private Map<String, Object> record;
|
16
|
+
private PageBuilder pageBuilder;
|
17
|
+
private final List<DateParser> timestampParsers;
|
18
|
+
private final Logger logger = Exec.getLogger(GrokColumnVisitor.class.getName());
|
19
|
+
|
20
|
+
private static final ImmutableSet<String> TRUE_STRINGS =
|
21
|
+
ImmutableSet.of(
|
22
|
+
"true", "True", "TRUE",
|
23
|
+
"yes", "Yes", "YES",
|
24
|
+
"t", "T", "y", "Y",
|
25
|
+
"on", "On", "ON",
|
26
|
+
"1");
|
27
|
+
|
28
|
+
public GrokColumnVisitor(Map<String, Object> record, PageBuilder pageBuilder, List<DateParser> timestampParsers) {
|
29
|
+
this.record = record;
|
30
|
+
this.pageBuilder = pageBuilder;
|
31
|
+
this.timestampParsers = timestampParsers;
|
32
|
+
}
|
33
|
+
|
34
|
+
@Override
|
35
|
+
public void booleanColumn(Column column) {
|
36
|
+
if (record.get(column.getName()) == null) {
|
37
|
+
pageBuilder.setNull(column);
|
38
|
+
} else {
|
39
|
+
pageBuilder.setBoolean(column, TRUE_STRINGS.contains(record.get(column.getName()).toString()));
|
40
|
+
}
|
41
|
+
}
|
42
|
+
|
43
|
+
@Override
|
44
|
+
public void longColumn(Column column) {
|
45
|
+
Object longNum = record.get(column.getName());
|
46
|
+
if (longNum == null || longNum.toString().equals("null")) {
|
47
|
+
pageBuilder.setNull(column);
|
48
|
+
} else {
|
49
|
+
try {
|
50
|
+
pageBuilder.setLong(column, Long.parseLong(longNum.toString()));
|
51
|
+
} catch (NumberFormatException e) {
|
52
|
+
logger.error("This column is not Long:" + longNum.toString(), e);
|
53
|
+
throw new GrokRecordValidateException(e);
|
54
|
+
}
|
55
|
+
}
|
56
|
+
}
|
57
|
+
|
58
|
+
@Override
|
59
|
+
public void doubleColumn(Column column) {
|
60
|
+
Object dbl = record.get(column.getName());
|
61
|
+
if (dbl == null) {
|
62
|
+
pageBuilder.setNull(column);
|
63
|
+
} else {
|
64
|
+
try {
|
65
|
+
pageBuilder.setDouble(column, Double.parseDouble(dbl.toString()));
|
66
|
+
} catch (NumberFormatException e) {
|
67
|
+
logger.error("This column is not Double:" + dbl.toString(), e);
|
68
|
+
throw new GrokRecordValidateException(e);
|
69
|
+
}
|
70
|
+
}
|
71
|
+
}
|
72
|
+
|
73
|
+
@Override
|
74
|
+
public void stringColumn(Column column) {
|
75
|
+
if (record.get(column.getName()) == null)
|
76
|
+
pageBuilder.setNull(column);
|
77
|
+
else {
|
78
|
+
pageBuilder.setString(column, record.get(column.getName()).toString());
|
79
|
+
}
|
80
|
+
}
|
81
|
+
|
82
|
+
@Override
|
83
|
+
public void timestampColumn(Column column) {
|
84
|
+
Object time = record.get(column.getName());
|
85
|
+
|
86
|
+
if (time == null) {
|
87
|
+
pageBuilder.setNull(column);
|
88
|
+
} else {
|
89
|
+
String timeString = time.toString();
|
90
|
+
try {
|
91
|
+
pageBuilder.setTimestamp(column, timestampParsers.get(column.getIndex()).parse(timeString));
|
92
|
+
} catch (RuntimeException e) {
|
93
|
+
logger.error("TimestampParseError:" + column.getName() + ", timeString:" + timeString + ", getIndex:" + column.getIndex(), e);
|
94
|
+
throw new GrokRecordValidateException(e);
|
95
|
+
}
|
96
|
+
}
|
97
|
+
}
|
98
|
+
}
|
@@ -0,0 +1,68 @@
|
|
1
|
+
package org.embulk.parser.grok;
|
2
|
+
|
3
|
+
import com.google.common.collect.ImmutableList;
|
4
|
+
import com.google.common.collect.ImmutableMap;
|
5
|
+
import oi.thekraken.grok.api.exception.GrokException;
|
6
|
+
import org.embulk.config.*;
|
7
|
+
import org.embulk.spi.Buffer;
|
8
|
+
import org.embulk.spi.Exec;
|
9
|
+
import org.embulk.spi.GuessPlugin;
|
10
|
+
import org.embulk.spi.util.LineDecoder;
|
11
|
+
import org.embulk.spi.util.ListFileInput;
|
12
|
+
import org.slf4j.Logger;
|
13
|
+
|
14
|
+
import java.util.ArrayList;
|
15
|
+
import java.util.List;
|
16
|
+
import java.util.Map;
|
17
|
+
|
18
|
+
public class GrokGuessPlugin implements GuessPlugin {
|
19
|
+
public final Logger logger = Exec.getLogger(GrokGuessPlugin.class.getName());
|
20
|
+
|
21
|
+
public interface PluginTask
|
22
|
+
extends Task, LineDecoder.DecoderTask {
|
23
|
+
|
24
|
+
@Config("grok_pattern_files")
|
25
|
+
List<String> getGrokPatternFiles();
|
26
|
+
|
27
|
+
@Config("guess_patterns")
|
28
|
+
@ConfigDefault("[]")
|
29
|
+
List<String> getGuessPatterns();
|
30
|
+
}
|
31
|
+
|
32
|
+
@Override
|
33
|
+
public ConfigDiff guess(ConfigSource config, Buffer sample) {
|
34
|
+
|
35
|
+
GrokGuessPlugin.PluginTask task = config.getNested("parser").loadConfig(GrokGuessPlugin.PluginTask.class);
|
36
|
+
|
37
|
+
LineDecoder.DecoderTask decoderTask = config.loadConfig(LineDecoder.DecoderTask.class);
|
38
|
+
LineDecoder decoder = new LineDecoder(new ListFileInput(ImmutableList.of(ImmutableList.of((sample)))), decoderTask);
|
39
|
+
|
40
|
+
List<String> sampleLines = new ArrayList<>();
|
41
|
+
while (true) {
|
42
|
+
if (!decoder.nextFile()) {
|
43
|
+
break;
|
44
|
+
}
|
45
|
+
while (true) {
|
46
|
+
String line = decoder.poll();
|
47
|
+
if (line == null) {
|
48
|
+
break;
|
49
|
+
}
|
50
|
+
sampleLines.add(line);
|
51
|
+
}
|
52
|
+
}
|
53
|
+
|
54
|
+
GrokGuesser guesser = new GrokGuesser(
|
55
|
+
task.getGuessPatterns(),
|
56
|
+
task.getGrokPatternFiles()
|
57
|
+
);
|
58
|
+
try {
|
59
|
+
String pattern = guesser.guessPattern(sampleLines);
|
60
|
+
List<Map<String, Object>> columns = guesser.guessColumns(sampleLines, pattern);
|
61
|
+
return Exec.newConfigDiff().set(
|
62
|
+
"parser", ImmutableMap.of("grok_pattern", pattern, "columns", columns));
|
63
|
+
} catch (GrokException e) {
|
64
|
+
return Exec.newConfigDiff();
|
65
|
+
}
|
66
|
+
|
67
|
+
}
|
68
|
+
}
|
@@ -0,0 +1,185 @@
|
|
1
|
+
package org.embulk.parser.grok;
|
2
|
+
|
3
|
+
import com.google.common.collect.ImmutableMap;
|
4
|
+
import oi.thekraken.grok.api.Grok;
|
5
|
+
import oi.thekraken.grok.api.Match;
|
6
|
+
import oi.thekraken.grok.api.exception.GrokException;
|
7
|
+
|
8
|
+
import java.text.ParseException;
|
9
|
+
import java.text.SimpleDateFormat;
|
10
|
+
import java.util.*;
|
11
|
+
import java.util.stream.Collectors;
|
12
|
+
|
13
|
+
public class GrokGuesser {
|
14
|
+
|
15
|
+
private List<String> guessPatterns;
|
16
|
+
private List<String> patternFiles;
|
17
|
+
|
18
|
+
public GrokGuesser(List<String> guessPatterns, List<String> patternFiles) {
|
19
|
+
this.guessPatterns = guessPatterns;
|
20
|
+
this.patternFiles = patternFiles;
|
21
|
+
}
|
22
|
+
|
23
|
+
public String guessPattern(List<String> sampleLines) throws GrokException {
|
24
|
+
for (String guessPattern : guessPatterns) {
|
25
|
+
Grok grok = new Grok();
|
26
|
+
for (String file : patternFiles) {
|
27
|
+
grok.addPatternFromFile(file);
|
28
|
+
}
|
29
|
+
try {
|
30
|
+
grok.compile(guessPattern);
|
31
|
+
} catch (GrokException e) {
|
32
|
+
continue;
|
33
|
+
}
|
34
|
+
|
35
|
+
boolean allMatch = sampleLines.stream().allMatch(line -> {
|
36
|
+
Match m = grok.match(line);
|
37
|
+
m.captures();
|
38
|
+
return !m.isNull();
|
39
|
+
});
|
40
|
+
if (allMatch) {
|
41
|
+
return guessPattern;
|
42
|
+
}
|
43
|
+
}
|
44
|
+
|
45
|
+
throw new GrokException("Patterns not matched");
|
46
|
+
}
|
47
|
+
|
48
|
+
public List<Map<String, Object>> guessColumns(List<String> sampleLines, String pattern) throws GrokException {
|
49
|
+
|
50
|
+
Grok grok = new Grok();
|
51
|
+
for (String file : patternFiles) {
|
52
|
+
grok.addPatternFromFile(file);
|
53
|
+
}
|
54
|
+
grok.compile(pattern);
|
55
|
+
|
56
|
+
List<Map<String, Object>> records = sampleLines.stream().map(line -> {
|
57
|
+
Match m = grok.match(line);
|
58
|
+
m.captures();
|
59
|
+
return m.toMap();
|
60
|
+
}).collect(Collectors.toList());
|
61
|
+
|
62
|
+
return guessTypesFromRecords(records);
|
63
|
+
}
|
64
|
+
|
65
|
+
|
66
|
+
private List<Map<String, Object>> guessTypesFromRecords(List<Map<String, Object>> samples) {
|
67
|
+
Map<String, ColumnType> types = new HashMap<>();
|
68
|
+
for (Map<String, Object> record : samples) {
|
69
|
+
for (Map.Entry<String, Object> entry : record.entrySet()) {
|
70
|
+
ColumnType currentType = guessType(entry.getValue());
|
71
|
+
if (types.containsKey(entry.getKey())) {
|
72
|
+
types.put(entry.getKey(), mergeType(currentType, types.get(entry.getKey())));
|
73
|
+
} else {
|
74
|
+
types.put(entry.getKey(), currentType);
|
75
|
+
}
|
76
|
+
}
|
77
|
+
}
|
78
|
+
return types.entrySet().stream().map(entry -> {
|
79
|
+
Map<String, Object> val = new HashMap<>();
|
80
|
+
val.put("name", entry.getKey());
|
81
|
+
val.put("type", entry.getValue().getType());
|
82
|
+
if (entry.getValue().getType().equals("timestamp")) {
|
83
|
+
val.put("format", entry.getValue().getFormat());
|
84
|
+
}
|
85
|
+
return val;
|
86
|
+
}).collect(Collectors.toList());
|
87
|
+
}
|
88
|
+
|
89
|
+
private Map<String, SimpleDateFormat> timestampFormats = ImmutableMap.of(
|
90
|
+
"%d/%b/%Y:%T %z", new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss z", Locale.ENGLISH),
|
91
|
+
"%Y-%m-%d %H:%M:%S", new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"),
|
92
|
+
"%Y-%m-%d %H:%M:%S.%N", new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"),
|
93
|
+
"%Y-%m-%d %H:%M:%S.%N %z", new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS z"),
|
94
|
+
"%Y-%m-%dT%H:%M:%S.%N%z", new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSz")
|
95
|
+
);
|
96
|
+
|
97
|
+
private ColumnType guessType(Object value) {
|
98
|
+
|
99
|
+
if (value == null) {
|
100
|
+
return new ColumnType("string");
|
101
|
+
} else if (value instanceof Integer) {
|
102
|
+
return new ColumnType("long");
|
103
|
+
} else if (value instanceof Double) {
|
104
|
+
return new ColumnType("double");
|
105
|
+
} else {
|
106
|
+
Optional<String> dateFormat = timestampFormats.entrySet().stream().filter(e -> {
|
107
|
+
try {
|
108
|
+
return e.getValue().parse(value.toString()) != null;
|
109
|
+
} catch (ParseException e1) {
|
110
|
+
return false;
|
111
|
+
}
|
112
|
+
}).map(Map.Entry::getKey).findFirst();
|
113
|
+
if (dateFormat.isPresent()) {
|
114
|
+
return new ColumnType("timestamp", dateFormat.get());
|
115
|
+
} else {
|
116
|
+
return new ColumnType("string");
|
117
|
+
}
|
118
|
+
}
|
119
|
+
}
|
120
|
+
|
121
|
+
private ColumnType mergeType(ColumnType t1, ColumnType t2) {
|
122
|
+
if (t1.equals(t2)) {
|
123
|
+
return t1;
|
124
|
+
}
|
125
|
+
|
126
|
+
if (t1.getType().equals("string") || t2.getType().equals("string")) {
|
127
|
+
return new ColumnType("string");
|
128
|
+
}
|
129
|
+
|
130
|
+
if (t1.getType().equals("timestamp") || t2.getType().equals("timestamp")) {
|
131
|
+
return new ColumnType("string");
|
132
|
+
}
|
133
|
+
|
134
|
+
if ((t1.getType().equals("long") && t2.getType().equals("double"))
|
135
|
+
|| (t1.getType().equals("double") && t2.getType().equals("long"))) {
|
136
|
+
return new ColumnType("double");
|
137
|
+
}
|
138
|
+
|
139
|
+
return new ColumnType("string");
|
140
|
+
}
|
141
|
+
|
142
|
+
static class ColumnType {
|
143
|
+
private String type;
|
144
|
+
private String format;
|
145
|
+
|
146
|
+
public ColumnType(String type) {
|
147
|
+
this.type = type;
|
148
|
+
this.format = null;
|
149
|
+
}
|
150
|
+
|
151
|
+
public ColumnType(String type, String format) {
|
152
|
+
this.type = type;
|
153
|
+
this.format = format;
|
154
|
+
}
|
155
|
+
|
156
|
+
public String getType() {
|
157
|
+
return type;
|
158
|
+
}
|
159
|
+
|
160
|
+
public void setType(String type) {
|
161
|
+
this.type = type;
|
162
|
+
}
|
163
|
+
|
164
|
+
public String getFormat() {
|
165
|
+
return format;
|
166
|
+
}
|
167
|
+
|
168
|
+
public void setFormat(String format) {
|
169
|
+
this.format = format;
|
170
|
+
}
|
171
|
+
|
172
|
+
@Override
|
173
|
+
public boolean equals(Object o) {
|
174
|
+
if (this == o) return true;
|
175
|
+
if (o == null || getClass() != o.getClass()) return false;
|
176
|
+
|
177
|
+
ColumnType that = (ColumnType) o;
|
178
|
+
|
179
|
+
if (type != null ? !type.equals(that.type) : that.type != null) return false;
|
180
|
+
return format != null ? format.equals(that.format) : that.format == null;
|
181
|
+
|
182
|
+
}
|
183
|
+
}
|
184
|
+
}
|
185
|
+
|