embulk-parser-grok 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/LICENSE.txt +21 -0
- data/README.md +113 -0
- data/build.gradle +95 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +6 -0
- data/gradlew +164 -0
- data/gradlew.bat +90 -0
- data/lib/embulk/guess/grok.rb +3 -0
- data/lib/embulk/parser/grok.rb +3 -0
- data/pattern/grok-patterns +105 -0
- data/pattern/my-patterns +12 -0
- data/sample/apache.yml +28 -0
- data/sample/apache_stop.yml +29 -0
- data/sample/guess.yml +19 -0
- data/sample/multiline.yml +19 -0
- data/src/main/java/org/embulk/parser/grok/DateParser.java +8 -0
- data/src/main/java/org/embulk/parser/grok/GrokColumnVisitor.java +98 -0
- data/src/main/java/org/embulk/parser/grok/GrokGuessPlugin.java +68 -0
- data/src/main/java/org/embulk/parser/grok/GrokGuesser.java +185 -0
- data/src/main/java/org/embulk/parser/grok/GrokParserPlugin.java +94 -0
- data/src/main/java/org/embulk/parser/grok/GrokRecordIterator.java +71 -0
- data/src/main/java/org/embulk/parser/grok/GrokRecordValidateException.java +11 -0
- data/src/main/java/org/embulk/parser/grok/MultipleLineDecoder.java +66 -0
- data/src/main/java/org/embulk/parser/grok/TimestampParserFactory.java +85 -0
- data/src/test/java/org/embulk/parser/TestGrokGuessPlugin.java +50 -0
- data/src/test/java/org/embulk/parser/TestGrokParserPlugin.java +55 -0
- data/src/test/java/org/embulk/parser/TestGrokPluginBase.java +69 -0
- data/src/test/java/org/embulk/util/EmbulkPluginTester.java +80 -0
- data/src/test/java/org/embulk/util/StreamUtil.java +30 -0
- data/src/test/resources/apache.log +101 -0
- data/src/test/resources/apache.yml +35 -0
- data/src/test/resources/apache_with_error.log +101 -0
- data/src/test/resources/expected_apache.csv +102 -0
- data/src/test/resources/expected_multiline.csv +45 -0
- data/src/test/resources/guess.yml +19 -0
- data/src/test/resources/multiline.log +44 -0
- data/src/test/resources/multiline.yml +25 -0
- metadata +115 -0
@@ -0,0 +1,105 @@
|
|
1
|
+
# Forked from https://github.com/elasticsearch/logstash/tree/v1.4.0/patterns
|
2
|
+
USERNAME [a-zA-Z0-9._-]+
|
3
|
+
USER %{USERNAME:UNWANTED}
|
4
|
+
INT (?:[+-]?(?:[0-9]+))
|
5
|
+
BASE10NUM (?<![0-9.+-])(?>[+-]?(?:(?:[0-9]+(?:\.[0-9]+)?)|(?:\.[0-9]+)))
|
6
|
+
NUMBER (?:%{BASE10NUM:UNWANTED})
|
7
|
+
BASE16NUM (?<![0-9A-Fa-f])(?:[+-]?(?:0x)?(?:[0-9A-Fa-f]+))
|
8
|
+
BASE16FLOAT \b(?<![0-9A-Fa-f.])(?:[+-]?(?:0x)?(?:(?:[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]*)?)|(?:\.[0-9A-Fa-f]+)))\b
|
9
|
+
|
10
|
+
POSINT \b(?:[1-9][0-9]*)\b
|
11
|
+
NONNEGINT \b(?:[0-9]+)\b
|
12
|
+
WORD \b\w+\b
|
13
|
+
WORDOREMPTY \b\w*\b
|
14
|
+
NOTSPACE \S+
|
15
|
+
SPACE \s*
|
16
|
+
DATA .*?
|
17
|
+
GREEDYDATA .*
|
18
|
+
#QUOTEDSTRING (?:(?<!\\)(?:"(?:\\.|[^\\"])*"|(?:'(?:\\.|[^\\'])*')|(?:`(?:\\.|[^\\`])*`)))
|
19
|
+
QUOTEDSTRING (?>(?<!\\)(?>"(?>\\.|[^\\"]+)+"|""|(?>'(?>\\.|[^\\']+)+')|''|(?>`(?>\\.|[^\\`]+)+`)|``))
|
20
|
+
UUID [A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12}
|
21
|
+
RESOURCEID \b[A-Za-z0-9_/-]*\b
|
22
|
+
|
23
|
+
# Networking
|
24
|
+
MAC (?:%{CISCOMAC:UNWANTED}|%{WINDOWSMAC:UNWANTED}|%{COMMONMAC:UNWANTED})
|
25
|
+
CISCOMAC (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4})
|
26
|
+
WINDOWSMAC (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})
|
27
|
+
COMMONMAC (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})
|
28
|
+
IPV6 ((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?
|
29
|
+
IPV4 (?<![0-9])(?:(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2}))(?![0-9])
|
30
|
+
IP (?:%{IPV6:UNWANTED}|%{IPV4:UNWANTED})
|
31
|
+
HOSTNAME \b(?:[0-9A-Za-z][0-9A-Za-z-]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-]{0,62}))*(\.?|\b)
|
32
|
+
HOST %{HOSTNAME:UNWANTED}
|
33
|
+
IPORHOST (?:%{HOSTNAME:UNWANTED}|%{IP:UNWANTED})
|
34
|
+
HOSTPORT (?:%{IPORHOST}:%{POSINT:PORT})
|
35
|
+
|
36
|
+
# paths
|
37
|
+
PATH (?:%{UNIXPATH}|%{WINPATH})
|
38
|
+
UNIXPATH (?>/(?>[\w_%!$@:.,~-]+|\\.)*)+
|
39
|
+
#UNIXPATH (?<![\w\/])(?:/[^\/\s?*]*)+
|
40
|
+
TTY (?:/dev/(pts|tty([pq])?)(\w+)?/?(?:[0-9]+))
|
41
|
+
WINPATH (?>[A-Za-z]+:|\\)(?:\\[^\\?*]*)+
|
42
|
+
URIPROTO [A-Za-z]+(\+[A-Za-z+]+)?
|
43
|
+
URIHOST %{IPORHOST}(?::%{POSINT:port})?
|
44
|
+
# uripath comes loosely from RFC1738, but mostly from what Firefox
|
45
|
+
# doesn't turn into %XX
|
46
|
+
URIPATH (?:/[A-Za-z0-9$.+!*'(){},~:;=@#%_\-]*)+
|
47
|
+
#URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)?
|
48
|
+
URIPARAM \?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\-\[\]]*
|
49
|
+
URIPATHPARAM %{URIPATH}(?:%{URIPARAM})?
|
50
|
+
URI %{URIPROTO}://(?:%{USER}(?::[^@]*)?@)?(?:%{URIHOST})?(?:%{URIPATHPARAM})?
|
51
|
+
|
52
|
+
# Months: January, Feb, 3, 03, 12, December
|
53
|
+
MONTH \b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\b
|
54
|
+
MONTHNUM (?:0?[1-9]|1[0-2])
|
55
|
+
MONTHNUM2 (?:0[1-9]|1[0-2])
|
56
|
+
MONTHDAY (?:(?:0[1-9])|(?:[12][0-9])|(?:3[01])|[1-9])
|
57
|
+
|
58
|
+
# Days: Monday, Tue, Thu, etc...
|
59
|
+
DAY (?:Mon(?:day)?|Tue(?:sday)?|Wed(?:nesday)?|Thu(?:rsday)?|Fri(?:day)?|Sat(?:urday)?|Sun(?:day)?)
|
60
|
+
|
61
|
+
# Years?
|
62
|
+
YEAR (?>\d\d){1,2}
|
63
|
+
# Time: HH:MM:SS
|
64
|
+
#TIME \d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?
|
65
|
+
# I'm still on the fence about using grok to perform the time match,
|
66
|
+
# since it's probably slower.
|
67
|
+
# TIME %{POSINT<24}:%{POSINT<60}(?::%{POSINT<60}(?:\.%{POSINT})?)?
|
68
|
+
HOUR (?:2[0123]|[01]?[0-9])
|
69
|
+
MINUTE (?:[0-5][0-9])
|
70
|
+
# '60' is a leap second in most time standards and thus is valid.
|
71
|
+
SECOND (?:(?:[0-5]?[0-9]|60)(?:[:.,][0-9]+)?)
|
72
|
+
TIME (?<![0-9])%{HOUR:UNWANTED}:%{MINUTE:UNWANTED}(?::%{SECOND:UNWANTED})(?![0-9])
|
73
|
+
# datestamp is YYYY/MM/DD-HH:MM:SS.UUUU (or something like it)
|
74
|
+
DATE_US %{MONTHNUM}[/-]%{MONTHDAY}[/-]%{YEAR}
|
75
|
+
DATE_EU %{MONTHDAY}[./-]%{MONTHNUM}[./-]%{YEAR}
|
76
|
+
ISO8601_TIMEZONE (?:Z|[+-]%{HOUR:UNWANTED}(?::?%{MINUTE:UNWANTED}))
|
77
|
+
ISO8601_SECOND (?:%{SECOND:UNWANTED}|60)
|
78
|
+
TIMESTAMP_ISO8601 %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR:UNWANTED}:?%{MINUTE:UNWANTED}(?::?%{SECOND:UNWANTED})?%{ISO8601_TIMEZONE}?
|
79
|
+
DATE %{DATE_US}|%{DATE_EU}
|
80
|
+
DATESTAMP %{DATE}[- ]%{TIME}
|
81
|
+
TZ (?:[PMCE][SD]T|UTC)
|
82
|
+
DATESTAMP_RFC822 %{DAY} %{MONTH} %{MONTHDAY} %{YEAR} %{TIME} %{TZ}
|
83
|
+
DATESTAMP_RFC2822 %{DAY}, %{MONTHDAY} %{MONTH} %{YEAR} %{TIME} %{ISO8601_TIMEZONE}
|
84
|
+
DATESTAMP_OTHER %{DAY} %{MONTH} %{MONTHDAY} %{TIME} %{TZ} %{YEAR}
|
85
|
+
DATESTAMP_EVENTLOG %{YEAR}%{MONTHNUM2}%{MONTHDAY}%{HOUR:UNWANTED}%{MINUTE:UNWANTED}%{SECOND:UNWANTED}
|
86
|
+
|
87
|
+
# Syslog Dates: Month Day HH:MM:SS
|
88
|
+
SYSLOGTIMESTAMP %{MONTH} +%{MONTHDAY} %{TIME}
|
89
|
+
PROG (?:[\w._/%-]+)
|
90
|
+
SYSLOGPROG %{PROG:program}(?:\[%{POSINT:pid}\])?
|
91
|
+
SYSLOGHOST %{IPORHOST}
|
92
|
+
SYSLOGFACILITY <%{NONNEGINT:facility}.%{NONNEGINT:priority}>
|
93
|
+
HTTPDATE %{MONTHDAY:UNWANTED}/%{MONTH:UNWANTED}/%{YEAR:UNWANTED}:%{TIME:UNWANTED} %{INT:UNWANTED}
|
94
|
+
|
95
|
+
# Shortcuts
|
96
|
+
QS %{QUOTEDSTRING:UNWANTED}
|
97
|
+
|
98
|
+
# Log formats
|
99
|
+
SYSLOGBASE %{SYSLOGTIMESTAMP:timestamp} (?:%{SYSLOGFACILITY} )?%{SYSLOGHOST:logsource} %{SYSLOGPROG}:
|
100
|
+
|
101
|
+
MESSAGESLOG %{SYSLOGBASE} %{DATA}
|
102
|
+
|
103
|
+
COMMONAPACHELOG %{IPORHOST:clientip} %{USER:ident} %{USER:auth} \[%{HTTPDATE:timestamp}\] "(?:%{WORD:verb} %{NOTSPACE:request}(?: HTTP/%{NUMBER:httpversion})?|%{DATA:rawrequest})" %{NUMBER:response} (?:%{NUMBER:bytes}|-)
|
104
|
+
COMBINEDAPACHELOG %{COMMONAPACHELOG} %{QS:referrer} %{QS:agent}
|
105
|
+
|
data/pattern/my-patterns
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
|
2
|
+
LOGLEVEL ([A|a]lert|ALERT|[T|t]race|TRACE|[D|d]ebug|DEBUG|[N|n]otice|NOTICE|[I|i]nfo|INFO|[W|w]arn?(?:ing)?|WARN?(?:ING)?|[E|e]rr?(?:or)?|ERR?(?:OR)?|[C|c]rit?(?:ical)?|CRIT?(?:ICAL)?|[F|f]atal|FATAL|[S|s]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?)
|
3
|
+
|
4
|
+
TIMESTAMP_ISO8601_WITH_SPACE %{YEAR}-%{MONTHNUM}-%{MONTHDAY}[T ]%{HOUR:UNWANTED}:?%{MINUTE:UNWANTED}(?::?%{SECOND:UNWANTED})?( %{ISO8601_TIMEZONE})?
|
5
|
+
|
6
|
+
MULTILINES (.*+\n)*.*
|
7
|
+
MULTILINESTOTHEEND (.*+\n)*+
|
8
|
+
|
9
|
+
MULTILINELOG_FIRSTLINE %{TIMESTAMP_ISO8601_WITH_SPACE:timestamp} \[%{LOGLEVEL:log_level}\] %{DATA:message}$
|
10
|
+
MULTILINELOG %{TIMESTAMP_ISO8601_WITH_SPACE:timestamp} \[%{LOGLEVEL:log_level}\] %{DATA:message}(?:\n%{MULTILINESTOTHEEND:stack_trace})?$
|
11
|
+
|
12
|
+
|
data/sample/apache.yml
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: src/test/resources/apache.log
|
4
|
+
parser:
|
5
|
+
charset: UTF-8
|
6
|
+
newline: CRLF
|
7
|
+
type: grok
|
8
|
+
grok_pattern_files: [pattern/grok-patterns, pattern/my-patterns]
|
9
|
+
timestamp_parser: ruby
|
10
|
+
grok_pattern: '%{COMBINEDAPACHELOG}'
|
11
|
+
columns:
|
12
|
+
- {name: request, type: string}
|
13
|
+
- {name: agent, type: string}
|
14
|
+
- {name: COMMONAPACHELOG, type: string}
|
15
|
+
- {name: auth, type: string}
|
16
|
+
- {name: ident, type: string}
|
17
|
+
- {name: verb, type: string}
|
18
|
+
- {name: referrer, type: string}
|
19
|
+
- {name: bytes, type: long}
|
20
|
+
- {name: response, type: long}
|
21
|
+
- {name: clientip, type: string}
|
22
|
+
- {name: COMBINEDAPACHELOG, type: string}
|
23
|
+
- {name: httpversion, type: string}
|
24
|
+
- {name: rawrequest, type: string}
|
25
|
+
- {name: timestamp, format: '%d/%b/%Y:%T %z', type: timestamp}
|
26
|
+
exec:
|
27
|
+
guess_plugins: [grok]
|
28
|
+
out: {type: stdout}
|
@@ -0,0 +1,29 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: src/test/resources/apache_with_error.log
|
4
|
+
parser:
|
5
|
+
charset: UTF-8
|
6
|
+
newline: CRLF
|
7
|
+
type: grok
|
8
|
+
grok_pattern_files: [pattern/grok-patterns, pattern/my-patterns]
|
9
|
+
timestamp_parser: ruby
|
10
|
+
grok_pattern: '%{COMBINEDAPACHELOG}'
|
11
|
+
stop_on_invalid_record: true
|
12
|
+
columns:
|
13
|
+
- {name: request, type: string}
|
14
|
+
- {name: agent, type: string}
|
15
|
+
- {name: COMMONAPACHELOG, type: string}
|
16
|
+
- {name: auth, type: string}
|
17
|
+
- {name: ident, type: string}
|
18
|
+
- {name: verb, type: string}
|
19
|
+
- {name: referrer, type: string}
|
20
|
+
- {name: bytes, type: long}
|
21
|
+
- {name: response, type: long}
|
22
|
+
- {name: clientip, type: string}
|
23
|
+
- {name: COMBINEDAPACHELOG, type: string}
|
24
|
+
- {name: httpversion, type: string}
|
25
|
+
- {name: rawrequest, type: string}
|
26
|
+
- {name: timestamp, format: '%d/%b/%Y:%T %z', type: timestamp}
|
27
|
+
exec:
|
28
|
+
guess_plugins: [grok]
|
29
|
+
out: {type: stdout}
|
data/sample/guess.yml
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: src/test/resources/apache.log
|
4
|
+
parser:
|
5
|
+
charset: UTF-8
|
6
|
+
newline: CRLF
|
7
|
+
type: grok
|
8
|
+
grok_pattern_files:
|
9
|
+
- pattern/grok-patterns
|
10
|
+
- pattern/my-patterns
|
11
|
+
guess_patterns:
|
12
|
+
- "%{COMBINEDAPACHELOG}"
|
13
|
+
- "%{COMMONAPACHELOG}"
|
14
|
+
timestamp_parser: ruby
|
15
|
+
exec:
|
16
|
+
guess_plugins:
|
17
|
+
- "grok"
|
18
|
+
out:
|
19
|
+
type: stdout
|
@@ -0,0 +1,19 @@
|
|
1
|
+
in:
|
2
|
+
type: file
|
3
|
+
path_prefix: src/test/resources/multiline.log
|
4
|
+
parser:
|
5
|
+
charset: UTF-8
|
6
|
+
newline: CRLF
|
7
|
+
type: grok
|
8
|
+
grok_pattern_files: [pattern/grok-patterns, pattern/my-patterns]
|
9
|
+
timestamp_parser: ruby
|
10
|
+
first_line_pattern: '%{MULTILINELOG_FIRSTLINE}'
|
11
|
+
grok_pattern: '%{MULTILINELOG}'
|
12
|
+
columns:
|
13
|
+
- {name: timestamp, format: '%Y-%m-%d %H:%M:%S.%N %z', type: timestamp}
|
14
|
+
- {name: log_level, type: string}
|
15
|
+
- {name: message, type: string}
|
16
|
+
- {name: stack_trace, type: string}
|
17
|
+
exec:
|
18
|
+
guess_plugins: [grok]
|
19
|
+
out: {type: stdout}
|
@@ -0,0 +1,98 @@
|
|
1
|
+
package org.embulk.parser.grok;
|
2
|
+
|
3
|
+
import com.google.common.collect.ImmutableSet;
|
4
|
+
import org.embulk.spi.Column;
|
5
|
+
import org.embulk.spi.ColumnVisitor;
|
6
|
+
import org.embulk.spi.Exec;
|
7
|
+
import org.embulk.spi.PageBuilder;
|
8
|
+
import org.slf4j.Logger;
|
9
|
+
|
10
|
+
import java.util.List;
|
11
|
+
import java.util.Map;
|
12
|
+
|
13
|
+
public class GrokColumnVisitor implements ColumnVisitor {
|
14
|
+
|
15
|
+
private Map<String, Object> record;
|
16
|
+
private PageBuilder pageBuilder;
|
17
|
+
private final List<DateParser> timestampParsers;
|
18
|
+
private final Logger logger = Exec.getLogger(GrokColumnVisitor.class.getName());
|
19
|
+
|
20
|
+
private static final ImmutableSet<String> TRUE_STRINGS =
|
21
|
+
ImmutableSet.of(
|
22
|
+
"true", "True", "TRUE",
|
23
|
+
"yes", "Yes", "YES",
|
24
|
+
"t", "T", "y", "Y",
|
25
|
+
"on", "On", "ON",
|
26
|
+
"1");
|
27
|
+
|
28
|
+
public GrokColumnVisitor(Map<String, Object> record, PageBuilder pageBuilder, List<DateParser> timestampParsers) {
|
29
|
+
this.record = record;
|
30
|
+
this.pageBuilder = pageBuilder;
|
31
|
+
this.timestampParsers = timestampParsers;
|
32
|
+
}
|
33
|
+
|
34
|
+
@Override
|
35
|
+
public void booleanColumn(Column column) {
|
36
|
+
if (record.get(column.getName()) == null) {
|
37
|
+
pageBuilder.setNull(column);
|
38
|
+
} else {
|
39
|
+
pageBuilder.setBoolean(column, TRUE_STRINGS.contains(record.get(column.getName()).toString()));
|
40
|
+
}
|
41
|
+
}
|
42
|
+
|
43
|
+
@Override
|
44
|
+
public void longColumn(Column column) {
|
45
|
+
Object longNum = record.get(column.getName());
|
46
|
+
if (longNum == null || longNum.toString().equals("null")) {
|
47
|
+
pageBuilder.setNull(column);
|
48
|
+
} else {
|
49
|
+
try {
|
50
|
+
pageBuilder.setLong(column, Long.parseLong(longNum.toString()));
|
51
|
+
} catch (NumberFormatException e) {
|
52
|
+
logger.error("This column is not Long:" + longNum.toString(), e);
|
53
|
+
throw new GrokRecordValidateException(e);
|
54
|
+
}
|
55
|
+
}
|
56
|
+
}
|
57
|
+
|
58
|
+
@Override
|
59
|
+
public void doubleColumn(Column column) {
|
60
|
+
Object dbl = record.get(column.getName());
|
61
|
+
if (dbl == null) {
|
62
|
+
pageBuilder.setNull(column);
|
63
|
+
} else {
|
64
|
+
try {
|
65
|
+
pageBuilder.setDouble(column, Double.parseDouble(dbl.toString()));
|
66
|
+
} catch (NumberFormatException e) {
|
67
|
+
logger.error("This column is not Double:" + dbl.toString(), e);
|
68
|
+
throw new GrokRecordValidateException(e);
|
69
|
+
}
|
70
|
+
}
|
71
|
+
}
|
72
|
+
|
73
|
+
@Override
|
74
|
+
public void stringColumn(Column column) {
|
75
|
+
if (record.get(column.getName()) == null)
|
76
|
+
pageBuilder.setNull(column);
|
77
|
+
else {
|
78
|
+
pageBuilder.setString(column, record.get(column.getName()).toString());
|
79
|
+
}
|
80
|
+
}
|
81
|
+
|
82
|
+
@Override
|
83
|
+
public void timestampColumn(Column column) {
|
84
|
+
Object time = record.get(column.getName());
|
85
|
+
|
86
|
+
if (time == null) {
|
87
|
+
pageBuilder.setNull(column);
|
88
|
+
} else {
|
89
|
+
String timeString = time.toString();
|
90
|
+
try {
|
91
|
+
pageBuilder.setTimestamp(column, timestampParsers.get(column.getIndex()).parse(timeString));
|
92
|
+
} catch (RuntimeException e) {
|
93
|
+
logger.error("TimestampParseError:" + column.getName() + ", timeString:" + timeString + ", getIndex:" + column.getIndex(), e);
|
94
|
+
throw new GrokRecordValidateException(e);
|
95
|
+
}
|
96
|
+
}
|
97
|
+
}
|
98
|
+
}
|
@@ -0,0 +1,68 @@
|
|
1
|
+
package org.embulk.parser.grok;
|
2
|
+
|
3
|
+
import com.google.common.collect.ImmutableList;
|
4
|
+
import com.google.common.collect.ImmutableMap;
|
5
|
+
import oi.thekraken.grok.api.exception.GrokException;
|
6
|
+
import org.embulk.config.*;
|
7
|
+
import org.embulk.spi.Buffer;
|
8
|
+
import org.embulk.spi.Exec;
|
9
|
+
import org.embulk.spi.GuessPlugin;
|
10
|
+
import org.embulk.spi.util.LineDecoder;
|
11
|
+
import org.embulk.spi.util.ListFileInput;
|
12
|
+
import org.slf4j.Logger;
|
13
|
+
|
14
|
+
import java.util.ArrayList;
|
15
|
+
import java.util.List;
|
16
|
+
import java.util.Map;
|
17
|
+
|
18
|
+
public class GrokGuessPlugin implements GuessPlugin {
|
19
|
+
public final Logger logger = Exec.getLogger(GrokGuessPlugin.class.getName());
|
20
|
+
|
21
|
+
public interface PluginTask
|
22
|
+
extends Task, LineDecoder.DecoderTask {
|
23
|
+
|
24
|
+
@Config("grok_pattern_files")
|
25
|
+
List<String> getGrokPatternFiles();
|
26
|
+
|
27
|
+
@Config("guess_patterns")
|
28
|
+
@ConfigDefault("[]")
|
29
|
+
List<String> getGuessPatterns();
|
30
|
+
}
|
31
|
+
|
32
|
+
@Override
|
33
|
+
public ConfigDiff guess(ConfigSource config, Buffer sample) {
|
34
|
+
|
35
|
+
GrokGuessPlugin.PluginTask task = config.getNested("parser").loadConfig(GrokGuessPlugin.PluginTask.class);
|
36
|
+
|
37
|
+
LineDecoder.DecoderTask decoderTask = config.loadConfig(LineDecoder.DecoderTask.class);
|
38
|
+
LineDecoder decoder = new LineDecoder(new ListFileInput(ImmutableList.of(ImmutableList.of((sample)))), decoderTask);
|
39
|
+
|
40
|
+
List<String> sampleLines = new ArrayList<>();
|
41
|
+
while (true) {
|
42
|
+
if (!decoder.nextFile()) {
|
43
|
+
break;
|
44
|
+
}
|
45
|
+
while (true) {
|
46
|
+
String line = decoder.poll();
|
47
|
+
if (line == null) {
|
48
|
+
break;
|
49
|
+
}
|
50
|
+
sampleLines.add(line);
|
51
|
+
}
|
52
|
+
}
|
53
|
+
|
54
|
+
GrokGuesser guesser = new GrokGuesser(
|
55
|
+
task.getGuessPatterns(),
|
56
|
+
task.getGrokPatternFiles()
|
57
|
+
);
|
58
|
+
try {
|
59
|
+
String pattern = guesser.guessPattern(sampleLines);
|
60
|
+
List<Map<String, Object>> columns = guesser.guessColumns(sampleLines, pattern);
|
61
|
+
return Exec.newConfigDiff().set(
|
62
|
+
"parser", ImmutableMap.of("grok_pattern", pattern, "columns", columns));
|
63
|
+
} catch (GrokException e) {
|
64
|
+
return Exec.newConfigDiff();
|
65
|
+
}
|
66
|
+
|
67
|
+
}
|
68
|
+
}
|
@@ -0,0 +1,185 @@
|
|
1
|
+
package org.embulk.parser.grok;
|
2
|
+
|
3
|
+
import com.google.common.collect.ImmutableMap;
import oi.thekraken.grok.api.Grok;
import oi.thekraken.grok.api.Match;
import oi.thekraken.grok.api.exception.GrokException;

import java.text.ParseException;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.stream.Collectors;
|
12
|
+
|
13
|
+
public class GrokGuesser {
|
14
|
+
|
15
|
+
private List<String> guessPatterns;
|
16
|
+
private List<String> patternFiles;
|
17
|
+
|
18
|
+
public GrokGuesser(List<String> guessPatterns, List<String> patternFiles) {
|
19
|
+
this.guessPatterns = guessPatterns;
|
20
|
+
this.patternFiles = patternFiles;
|
21
|
+
}
|
22
|
+
|
23
|
+
public String guessPattern(List<String> sampleLines) throws GrokException {
|
24
|
+
for (String guessPattern : guessPatterns) {
|
25
|
+
Grok grok = new Grok();
|
26
|
+
for (String file : patternFiles) {
|
27
|
+
grok.addPatternFromFile(file);
|
28
|
+
}
|
29
|
+
try {
|
30
|
+
grok.compile(guessPattern);
|
31
|
+
} catch (GrokException e) {
|
32
|
+
continue;
|
33
|
+
}
|
34
|
+
|
35
|
+
boolean allMatch = sampleLines.stream().allMatch(line -> {
|
36
|
+
Match m = grok.match(line);
|
37
|
+
m.captures();
|
38
|
+
return !m.isNull();
|
39
|
+
});
|
40
|
+
if (allMatch) {
|
41
|
+
return guessPattern;
|
42
|
+
}
|
43
|
+
}
|
44
|
+
|
45
|
+
throw new GrokException("Patterns not matched");
|
46
|
+
}
|
47
|
+
|
48
|
+
public List<Map<String, Object>> guessColumns(List<String> sampleLines, String pattern) throws GrokException {
|
49
|
+
|
50
|
+
Grok grok = new Grok();
|
51
|
+
for (String file : patternFiles) {
|
52
|
+
grok.addPatternFromFile(file);
|
53
|
+
}
|
54
|
+
grok.compile(pattern);
|
55
|
+
|
56
|
+
List<Map<String, Object>> records = sampleLines.stream().map(line -> {
|
57
|
+
Match m = grok.match(line);
|
58
|
+
m.captures();
|
59
|
+
return m.toMap();
|
60
|
+
}).collect(Collectors.toList());
|
61
|
+
|
62
|
+
return guessTypesFromRecords(records);
|
63
|
+
}
|
64
|
+
|
65
|
+
|
66
|
+
private List<Map<String, Object>> guessTypesFromRecords(List<Map<String, Object>> samples) {
|
67
|
+
Map<String, ColumnType> types = new HashMap<>();
|
68
|
+
for (Map<String, Object> record : samples) {
|
69
|
+
for (Map.Entry<String, Object> entry : record.entrySet()) {
|
70
|
+
ColumnType currentType = guessType(entry.getValue());
|
71
|
+
if (types.containsKey(entry.getKey())) {
|
72
|
+
types.put(entry.getKey(), mergeType(currentType, types.get(entry.getKey())));
|
73
|
+
} else {
|
74
|
+
types.put(entry.getKey(), currentType);
|
75
|
+
}
|
76
|
+
}
|
77
|
+
}
|
78
|
+
return types.entrySet().stream().map(entry -> {
|
79
|
+
Map<String, Object> val = new HashMap<>();
|
80
|
+
val.put("name", entry.getKey());
|
81
|
+
val.put("type", entry.getValue().getType());
|
82
|
+
if (entry.getValue().getType().equals("timestamp")) {
|
83
|
+
val.put("format", entry.getValue().getFormat());
|
84
|
+
}
|
85
|
+
return val;
|
86
|
+
}).collect(Collectors.toList());
|
87
|
+
}
|
88
|
+
|
89
|
+
private Map<String, SimpleDateFormat> timestampFormats = ImmutableMap.of(
|
90
|
+
"%d/%b/%Y:%T %z", new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss z", Locale.ENGLISH),
|
91
|
+
"%Y-%m-%d %H:%M:%S", new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"),
|
92
|
+
"%Y-%m-%d %H:%M:%S.%N", new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"),
|
93
|
+
"%Y-%m-%d %H:%M:%S.%N %z", new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS z"),
|
94
|
+
"%Y-%m-%dT%H:%M:%S.%N%z", new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSz")
|
95
|
+
);
|
96
|
+
|
97
|
+
private ColumnType guessType(Object value) {
|
98
|
+
|
99
|
+
if (value == null) {
|
100
|
+
return new ColumnType("string");
|
101
|
+
} else if (value instanceof Integer) {
|
102
|
+
return new ColumnType("long");
|
103
|
+
} else if (value instanceof Double) {
|
104
|
+
return new ColumnType("double");
|
105
|
+
} else {
|
106
|
+
Optional<String> dateFormat = timestampFormats.entrySet().stream().filter(e -> {
|
107
|
+
try {
|
108
|
+
return e.getValue().parse(value.toString()) != null;
|
109
|
+
} catch (ParseException e1) {
|
110
|
+
return false;
|
111
|
+
}
|
112
|
+
}).map(Map.Entry::getKey).findFirst();
|
113
|
+
if (dateFormat.isPresent()) {
|
114
|
+
return new ColumnType("timestamp", dateFormat.get());
|
115
|
+
} else {
|
116
|
+
return new ColumnType("string");
|
117
|
+
}
|
118
|
+
}
|
119
|
+
}
|
120
|
+
|
121
|
+
private ColumnType mergeType(ColumnType t1, ColumnType t2) {
|
122
|
+
if (t1.equals(t2)) {
|
123
|
+
return t1;
|
124
|
+
}
|
125
|
+
|
126
|
+
if (t1.getType().equals("string") || t2.getType().equals("string")) {
|
127
|
+
return new ColumnType("string");
|
128
|
+
}
|
129
|
+
|
130
|
+
if (t1.getType().equals("timestamp") || t2.getType().equals("timestamp")) {
|
131
|
+
return new ColumnType("string");
|
132
|
+
}
|
133
|
+
|
134
|
+
if ((t1.getType().equals("long") && t2.getType().equals("double"))
|
135
|
+
|| (t1.getType().equals("double") && t2.getType().equals("long"))) {
|
136
|
+
return new ColumnType("double");
|
137
|
+
}
|
138
|
+
|
139
|
+
return new ColumnType("string");
|
140
|
+
}
|
141
|
+
|
142
|
+
static class ColumnType {
|
143
|
+
private String type;
|
144
|
+
private String format;
|
145
|
+
|
146
|
+
public ColumnType(String type) {
|
147
|
+
this.type = type;
|
148
|
+
this.format = null;
|
149
|
+
}
|
150
|
+
|
151
|
+
public ColumnType(String type, String format) {
|
152
|
+
this.type = type;
|
153
|
+
this.format = format;
|
154
|
+
}
|
155
|
+
|
156
|
+
public String getType() {
|
157
|
+
return type;
|
158
|
+
}
|
159
|
+
|
160
|
+
public void setType(String type) {
|
161
|
+
this.type = type;
|
162
|
+
}
|
163
|
+
|
164
|
+
public String getFormat() {
|
165
|
+
return format;
|
166
|
+
}
|
167
|
+
|
168
|
+
public void setFormat(String format) {
|
169
|
+
this.format = format;
|
170
|
+
}
|
171
|
+
|
172
|
+
@Override
|
173
|
+
public boolean equals(Object o) {
|
174
|
+
if (this == o) return true;
|
175
|
+
if (o == null || getClass() != o.getClass()) return false;
|
176
|
+
|
177
|
+
ColumnType that = (ColumnType) o;
|
178
|
+
|
179
|
+
if (type != null ? !type.equals(that.type) : that.type != null) return false;
|
180
|
+
return format != null ? format.equals(that.format) : that.format == null;
|
181
|
+
|
182
|
+
}
|
183
|
+
}
|
184
|
+
}
|
185
|
+
|