embulk-parser-apache-custom-log 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/CHANGES.md +9 -0
- data/LICENSE.txt +21 -0
- data/README.md +76 -0
- data/build.gradle +74 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +6 -0
- data/gradlew +164 -0
- data/gradlew.bat +90 -0
- data/lib/embulk/guess/apache-custom-log.rb +61 -0
- data/lib/embulk/parser/apache-custom-log.rb +3 -0
- data/src/main/java/org/embulk/parser/ApacheCustomLogParserPlugin.java +109 -0
- data/src/main/java/org/embulk/parser/apache/log/LogElement.java +41 -0
- data/src/main/java/org/embulk/parser/apache/log/LogElementFactory.java +6 -0
- data/src/main/java/org/embulk/parser/apache/log/LogFormats.java +152 -0
- data/src/main/java/org/embulk/parser/apache/log/LongLogElement.java +29 -0
- data/src/main/java/org/embulk/parser/apache/log/LongLogElementFactory.java +30 -0
- data/src/main/java/org/embulk/parser/apache/log/Patterns.java +23 -0
- data/src/main/java/org/embulk/parser/apache/log/Replacement.java +27 -0
- data/src/main/java/org/embulk/parser/apache/log/StringLogElement.java +33 -0
- data/src/main/java/org/embulk/parser/apache/log/StringLogElementFactory.java +29 -0
- data/src/main/java/org/embulk/parser/apache/log/TimestampLogElement.java +42 -0
- data/src/main/java/org/embulk/parser/apache/log/TimestampLogElementFactory.java +24 -0
- data/src/test/java/org/embulk/parser/TestApacheLogParserPlugin.java +162 -0
- data/src/test/java/org/embulk/parser/apache/log/LogFormatsTest.java +39 -0
- data/src/test/java/org/embulk/parser/apache/log/PatternsTest.java +120 -0
- data/src/test/java/org/embulk/parser/apache/log/StringLogElementFactoryTest.java +91 -0
- data/src/test/java/org/embulk/parser/apache/log/StringLogElementTest.java +51 -0
- data/src/test/java/org/embulk/tester/DummyConfigSource.java +86 -0
- data/src/test/java/org/embulk/tester/EmbulkPluginTester.java +52 -0
- data/src/test/java/org/embulk/tester/TestExtension.java +52 -0
- data/src/test/resources/META-INF/services/org.embulk.spi.Extension +1 -0
- data/src/test/resources/data/access_log_2_combined +1 -0
- data/src/test/resources/data/access_log_combined +2 -0
- data/src/test/resources/data/access_log_common +1 -0
- data/src/test/resources/resource.txt +0 -0
- data/src/test/resources/temp/dummy +0 -0
- data/src/test/resources/yml/test_combined.yml +13 -0
- data/src/test/resources/yml/test_combined2.yml +13 -0
- data/src/test/resources/yml/test_common.yml +13 -0
- metadata +115 -0
@@ -0,0 +1,61 @@
|
|
1
|
+
module Embulk
  module Guess

    # Placeholder for the guess plugin of this parser. Nothing is registered
    # yet: every class below is a commented-out template, so loading this file
    # only (re)opens the Embulk::Guess namespace.
    #
    # TODO implement guess plugin to make this command work:
    #      $ embulk guess -g "apache-log" partial-config.yml
    #
    # Depending on the file format the plugin uses, you can choose
    # one of binary guess (GuessPlugin), text guess (TextGuessPlugin),
    # or line guess (LineGuessPlugin).

    # Template 1: binary guess, inspects the raw sample buffer.
    #class ApacheCustomLogParserGuessPlugin < GuessPlugin
    #  Plugin.register_guess("apache-log", self)
    #
    #  def guess(config, sample_buffer)
    #    if sample_buffer[0,2] == GZIP_HEADER
    #      guessed = {}
    #      guessed["type"] = "apache-log"
    #      guessed["property1"] = "guessed-value"
    #      return {"parser" => guessed}
    #    else
    #      return {}
    #    end
    #  end
    #end

    # Template 2: text guess, inspects the decoded sample text.
    #class ApacheLogParserGuessPlugin < TextGuessPlugin
    #  Plugin.register_guess("apache-log", self)
    #
    #  def guess_text(config, sample_text)
    #    js = JSON.parse(sample_text) rescue nil
    #    if js && js["mykeyword"] == "keyword"
    #      guessed = {}
    #      guessed["type"] = "apache-log"
    #      guessed["property1"] = "guessed-value"
    #      return {"parser" => guessed}
    #    else
    #      return {}
    #    end
    #  end
    #end

    # Template 3: line guess, inspects the sample split into lines.
    #class ApacheLogParserGuessPlugin < LineGuessPlugin
    #  Plugin.register_guess("apache-log", self)
    #
    #  def guess_lines(config, sample_lines)
    #    all_line_matched = sample_lines.all? do |line|
    #      line =~ /mypattern/
    #    end
    #    if all_line_matched
    #      guessed = {}
    #      guessed["type"] = "apache-log"
    #      guessed["property1"] = "guessed-value"
    #      return {"parser" => guessed}
    #    else
    #      return {}
    #    end
    #  end
    #end

  end
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
package org.embulk.parser;
|
2
|
+
|
3
|
+
import com.google.common.collect.Lists;
|
4
|
+
import org.embulk.config.Config;
|
5
|
+
import org.embulk.config.ConfigSource;
|
6
|
+
import org.embulk.config.Task;
|
7
|
+
import org.embulk.config.TaskSource;
|
8
|
+
import org.embulk.parser.apache.log.LogElement;
|
9
|
+
import org.embulk.parser.apache.log.LogFormats;
|
10
|
+
import org.embulk.parser.apache.log.Replacement;
|
11
|
+
import org.embulk.spi.*;
|
12
|
+
import org.embulk.spi.time.TimestampParser;
|
13
|
+
import org.embulk.spi.util.LineDecoder;
|
14
|
+
import org.slf4j.Logger;
|
15
|
+
import org.slf4j.LoggerFactory;
|
16
|
+
|
17
|
+
import java.util.ArrayList;
|
18
|
+
import java.util.List;
|
19
|
+
import java.util.regex.Matcher;
|
20
|
+
import java.util.regex.Pattern;
|
21
|
+
|
22
|
+
public class ApacheCustomLogParserPlugin
|
23
|
+
implements ParserPlugin
|
24
|
+
{
|
25
|
+
|
26
|
+
private static final Logger logger = LoggerFactory.getLogger(ApacheCustomLogParserPlugin.class);
|
27
|
+
|
28
|
+
public interface PluginTask
|
29
|
+
extends Task, LineDecoder.DecoderTask, TimestampParser.Task
|
30
|
+
{
|
31
|
+
|
32
|
+
@Config("format")
|
33
|
+
String getFormat();
|
34
|
+
|
35
|
+
}
|
36
|
+
|
37
|
+
@Override
|
38
|
+
public void transaction(ConfigSource config, ParserPlugin.Control control)
|
39
|
+
{
|
40
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
41
|
+
ArrayList<ColumnConfig> columns = Lists.newArrayList();
|
42
|
+
final String format = task.getFormat();
|
43
|
+
|
44
|
+
List<Replacement> replacements = new LogFormats(task).getReplacements(format);
|
45
|
+
|
46
|
+
for (Replacement replacement : replacements) {
|
47
|
+
LogElement<?> logElement = replacement.getLogElement();
|
48
|
+
columns.add(logElement.getColumnConfig(config));
|
49
|
+
}
|
50
|
+
|
51
|
+
Schema schema = new SchemaConfig(columns).toSchema();
|
52
|
+
control.run(task.dump(), schema);
|
53
|
+
}
|
54
|
+
|
55
|
+
@Override
|
56
|
+
public void run(TaskSource taskSource, Schema schema,
|
57
|
+
FileInput input, PageOutput output)
|
58
|
+
{
|
59
|
+
PluginTask task = taskSource.loadTask(PluginTask.class);
|
60
|
+
LineDecoder lineDecoder = new LineDecoder(input,task);
|
61
|
+
PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output);
|
62
|
+
String line;
|
63
|
+
final String format = task.getFormat();
|
64
|
+
LogFormats logFormats = new LogFormats(task);
|
65
|
+
|
66
|
+
List<Replacement> replacements = logFormats.getReplacements(format);
|
67
|
+
|
68
|
+
String regexp = logFormats.logFormat2RegexpString(format);
|
69
|
+
|
70
|
+
logger.info("LogFormat : " + format);
|
71
|
+
logger.info("RegExp : " + regexp);
|
72
|
+
|
73
|
+
Pattern accessLogPattern = Pattern.compile("^" + regexp + "$", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
|
74
|
+
Matcher accessLogEntryMatcher;
|
75
|
+
|
76
|
+
int replacementSize = replacements.size();
|
77
|
+
|
78
|
+
logger.info("replacement : " + replacementSize);
|
79
|
+
|
80
|
+
while( input.nextFile() ){
|
81
|
+
while(true){
|
82
|
+
line = lineDecoder.poll();
|
83
|
+
|
84
|
+
if (line == null) {
|
85
|
+
break;
|
86
|
+
}
|
87
|
+
|
88
|
+
accessLogEntryMatcher = accessLogPattern.matcher(line);
|
89
|
+
|
90
|
+
if(replacementSize != accessLogEntryMatcher.groupCount()){
|
91
|
+
logger.warn("group count mismatch. + expected : " + replacementSize);
|
92
|
+
}
|
93
|
+
|
94
|
+
while(accessLogEntryMatcher.find()){
|
95
|
+
for (int i = 0; i < replacementSize; i++) {
|
96
|
+
LogElement<?> logElement = replacements.get(i).getLogElement();
|
97
|
+
String value = accessLogEntryMatcher.group(i + 1);
|
98
|
+
|
99
|
+
logElement.setToPageBuilder(pageBuilder, i, value);
|
100
|
+
}
|
101
|
+
}
|
102
|
+
|
103
|
+
pageBuilder.addRecord();
|
104
|
+
}
|
105
|
+
}
|
106
|
+
pageBuilder.finish();
|
107
|
+
}
|
108
|
+
|
109
|
+
}
|
@@ -0,0 +1,41 @@
|
|
1
|
+
package org.embulk.parser.apache.log;
|
2
|
+
|
3
|
+
|
4
|
+
import org.embulk.config.ConfigSource;
|
5
|
+
import org.embulk.spi.ColumnConfig;
|
6
|
+
import org.embulk.spi.PageBuilder;
|
7
|
+
import org.embulk.spi.type.Type;
|
8
|
+
|
9
|
+
public abstract class LogElement<T> {
|
10
|
+
|
11
|
+
protected String name;
|
12
|
+
protected String regexp;
|
13
|
+
protected final Type outputType;
|
14
|
+
|
15
|
+
public LogElement(String name, String regex, Type outputType){
|
16
|
+
this.name = name;
|
17
|
+
this.regexp = regex;
|
18
|
+
this.outputType = outputType;
|
19
|
+
}
|
20
|
+
|
21
|
+
public String getName(){
|
22
|
+
return name;
|
23
|
+
}
|
24
|
+
|
25
|
+
public String getRegexp() {
|
26
|
+
return regexp;
|
27
|
+
}
|
28
|
+
|
29
|
+
public Type getOutputType() {
|
30
|
+
return outputType;
|
31
|
+
}
|
32
|
+
|
33
|
+
public abstract T parse(String s);
|
34
|
+
|
35
|
+
public abstract void setToPageBuilder(PageBuilder pageBuilder, int i, String value);
|
36
|
+
|
37
|
+
public ColumnConfig getColumnConfig(ConfigSource config){
|
38
|
+
return new ColumnConfig(name, outputType, config);
|
39
|
+
}
|
40
|
+
|
41
|
+
}
|
@@ -0,0 +1,152 @@
|
|
1
|
+
package org.embulk.parser.apache.log;
|
2
|
+
|
3
|
+
import com.google.common.collect.Lists;
|
4
|
+
import org.embulk.spi.time.TimestampParser;
|
5
|
+
|
6
|
+
import java.util.*;
|
7
|
+
import java.util.regex.Matcher;
|
8
|
+
import java.util.regex.Pattern;
|
9
|
+
|
10
|
+
|
11
|
+
public class LogFormats implements Patterns {
|
12
|
+
|
13
|
+
TimestampParser.Task task;
|
14
|
+
|
15
|
+
public LogFormats(TimestampParser.Task task) {
|
16
|
+
this.task = task;
|
17
|
+
}
|
18
|
+
|
19
|
+
public Map<String, LogElementFactory<? extends LogElement>> getLogElementMappings(){
|
20
|
+
|
21
|
+
Map<String, LogElementFactory<? extends LogElement>> mapping = new HashMap<>();
|
22
|
+
|
23
|
+
mapping.put("a", new StringLogElementFactory("remote-ip", IP_ADDRESS));
|
24
|
+
mapping.put("A", new StringLogElementFactory("local-ip", IP_ADDRESS));
|
25
|
+
mapping.put("b", new LongLogElementFactory("response-bytes"));
|
26
|
+
mapping.put("B", new LongLogElementFactory("response-bytes"));
|
27
|
+
mapping.put("C", new StringLogElementFactory("request-cookie"));
|
28
|
+
mapping.put("D", new LongLogElementFactory("request-process-time-us"));
|
29
|
+
mapping.put("e", new StringLogElementFactory("env"));
|
30
|
+
mapping.put("f", new StringLogElementFactory("file-name"));
|
31
|
+
mapping.put("h", new StringLogElementFactory("remote-host"));
|
32
|
+
mapping.put("H", new StringLogElementFactory("request-protocol", NON_SPACE));
|
33
|
+
mapping.put("i", new StringLogElementFactory("request-header"));
|
34
|
+
mapping.put("l", new StringLogElementFactory("remote-log-name", NON_SPACE));
|
35
|
+
mapping.put("m", new StringLogElementFactory("request-method", METHOD));
|
36
|
+
|
37
|
+
mapping.put("n", new StringLogElementFactory("module-note"));
|
38
|
+
mapping.put("o", new StringLogElementFactory("response-header"));
|
39
|
+
|
40
|
+
mapping.put("p", new LongLogElementFactory("request-port"));
|
41
|
+
|
42
|
+
mapping.put("P", new LongLogElementFactory("request-process"));
|
43
|
+
|
44
|
+
mapping.put("q", new StringLogElementFactory("request-query", QUERY));
|
45
|
+
|
46
|
+
mapping.put("r", new StringLogElementFactory("request-line"));
|
47
|
+
mapping.put("s", new LongLogElementFactory("response-status", STATUS));
|
48
|
+
|
49
|
+
mapping.put("t", new TimestampLogElementFactory(task, "request-time"));
|
50
|
+
|
51
|
+
mapping.put("T", new LongLogElementFactory("request-process-time-s"));
|
52
|
+
|
53
|
+
mapping.put("u", new StringLogElementFactory("request-user"));
|
54
|
+
mapping.put("U", new StringLogElementFactory("request-path", PATH));
|
55
|
+
mapping.put("v", new StringLogElementFactory("request-server-name", NON_SPACE));
|
56
|
+
mapping.put("V", new StringLogElementFactory("canonical-server-name", NON_SPACE));
|
57
|
+
mapping.put("X", new StringLogElementFactory("connection-status", CONN_STATUS));
|
58
|
+
mapping.put("I", new LongLogElementFactory("request-total-bytes"));
|
59
|
+
mapping.put("O", new LongLogElementFactory("response-total-bytes"));
|
60
|
+
|
61
|
+
mapping.put("%", new StringLogElementFactory("%", "(¥¥%)"));
|
62
|
+
|
63
|
+
return mapping;
|
64
|
+
}
|
65
|
+
|
66
|
+
/**
|
67
|
+
* RegExp pattern of extract log format key
|
68
|
+
*
|
69
|
+
* this pattern has 9 groups, which are described as below.
|
70
|
+
*
|
71
|
+
* (%((!)?(\d{3}(,\d{3})*))?(<|>)?(\{([^\}]+)\})?([A-z]))
|
72
|
+
* | || | | | | | |- group(9) key
|
73
|
+
* | || | | | | |------------- group(8) optional parameter
|
74
|
+
* | || | | | |---------------- group(7) optional parameter wrapper group
|
75
|
+
* | || | | |---------------------- group(6) logging timing parameter
|
76
|
+
* | || | |---------------------------------- group(5) additional http status(es)
|
77
|
+
* | || |---------------------------------------- group(4) http status(es)
|
78
|
+
* | ||-------------------------------------------- group(3) inverse http status specifier
|
79
|
+
* | |--------------------------------------------- group(2) http status specifier
|
80
|
+
* |----------------------------------------------- group(0), group(1)
|
81
|
+
*
|
82
|
+
*/
|
83
|
+
public static final Pattern logFormatExtractor =
|
84
|
+
Pattern.compile("(%((!)?(\\d{3}(,\\d{3})*))?(<|>)?(\\{([^\\}]+)\\})?([A-z]))",
|
85
|
+
Pattern.DOTALL);
|
86
|
+
|
87
|
+
/**
|
88
|
+
* Convert logFormat String to Regexp String
|
89
|
+
* @param logFormat apache custom log format
|
90
|
+
* @return The pattern that matches CustomLog Configuration.
|
91
|
+
*
|
92
|
+
*/
|
93
|
+
public String logFormat2RegexpString(String logFormat){
|
94
|
+
List<Replacement> replacements = getReplacements(logFormat);
|
95
|
+
return replace(logFormat, replacements);
|
96
|
+
}
|
97
|
+
|
98
|
+
private String replace(String logFormat, List<Replacement> replacements) {
|
99
|
+
int offset = 0;
|
100
|
+
|
101
|
+
for (Replacement replacement : replacements) {
|
102
|
+
String left = logFormat.substring(0, offset + replacement.getStart());
|
103
|
+
String right = logFormat.substring(offset + replacement.getEnd(), logFormat.length());
|
104
|
+
int originalLength = logFormat.length() - left.length() - right.length();
|
105
|
+
|
106
|
+
String regexp = replacement.getLogElement().getRegexp();
|
107
|
+
logFormat = left + regexp + right;
|
108
|
+
offset += regexp.length() - originalLength;
|
109
|
+
}
|
110
|
+
return logFormat;
|
111
|
+
}
|
112
|
+
|
113
|
+
public List<Replacement> getReplacements(String logFormat) {
|
114
|
+
Matcher matcher = logFormatExtractor.matcher(logFormat);
|
115
|
+
|
116
|
+
List<Replacement> replacements = Lists.newArrayList();
|
117
|
+
|
118
|
+
while(matcher.find()){
|
119
|
+
if(matcher.groupCount() != 9){
|
120
|
+
throw new IllegalArgumentException("invalid regexp pattern");
|
121
|
+
}
|
122
|
+
String all = empty(matcher.group(1));
|
123
|
+
|
124
|
+
//TODO implement
|
125
|
+
//String ignoreStatus = empty(matcher.group(3));
|
126
|
+
//Object[] statuses = Arrays.stream(empty(matcher.group(4)).split(",")).toArray();
|
127
|
+
//String position = empty(matcher.group(6));
|
128
|
+
|
129
|
+
String parameter = matcher.group(8);
|
130
|
+
String key = empty(matcher.group(9));
|
131
|
+
|
132
|
+
LogElementFactory<? extends LogElement> factory = getLogElementMappings().get(key);
|
133
|
+
|
134
|
+
if(factory != null){
|
135
|
+
int start = matcher.start();
|
136
|
+
int end = matcher.end();
|
137
|
+
LogElement logElement = factory.create(parameter);
|
138
|
+
replacements.add(new Replacement(start, end, logElement));
|
139
|
+
}else{
|
140
|
+
throw new IllegalStateException("unknown log format key " + all);
|
141
|
+
}
|
142
|
+
|
143
|
+
}
|
144
|
+
return replacements;
|
145
|
+
}
|
146
|
+
|
147
|
+
private String empty(String s){
|
148
|
+
return s == null ? "" : s;
|
149
|
+
}
|
150
|
+
|
151
|
+
|
152
|
+
}
|
@@ -0,0 +1,29 @@
|
|
1
|
+
package org.embulk.parser.apache.log;
|
2
|
+
|
3
|
+
|
4
|
+
import org.embulk.spi.PageBuilder;
|
5
|
+
import org.embulk.spi.type.Types;
|
6
|
+
|
7
|
+
public class LongLogElement extends LogElement<Long> {
|
8
|
+
|
9
|
+
public LongLogElement(String name, String regex) {
|
10
|
+
super(name, regex, Types.LONG);
|
11
|
+
}
|
12
|
+
|
13
|
+
@Override
|
14
|
+
public Long parse(String s) {
|
15
|
+
try{
|
16
|
+
if("-".equals(s)){
|
17
|
+
return 0L;
|
18
|
+
}
|
19
|
+
return Long.parseLong(s);
|
20
|
+
}catch (NumberFormatException e){
|
21
|
+
return 0L;
|
22
|
+
}
|
23
|
+
}
|
24
|
+
|
25
|
+
@Override
|
26
|
+
public void setToPageBuilder(PageBuilder pageBuilder, int i, String value) {
|
27
|
+
pageBuilder.setLong(i, parse(value));
|
28
|
+
}
|
29
|
+
}
|
@@ -0,0 +1,30 @@
|
|
1
|
+
package org.embulk.parser.apache.log;
|
2
|
+
|
3
|
+
|
4
|
+
import org.apache.commons.lang3.StringUtils;
|
5
|
+
|
6
|
+
public class LongLogElementFactory implements LogElementFactory<LongLogElement>, Patterns {
|
7
|
+
|
8
|
+
private String name;
|
9
|
+
private String regexp;
|
10
|
+
|
11
|
+
public LongLogElementFactory(String name, String regexp) {
|
12
|
+
this.name = name;
|
13
|
+
this.regexp = regexp;
|
14
|
+
}
|
15
|
+
|
16
|
+
public LongLogElementFactory(String name) {
|
17
|
+
this.name = name;
|
18
|
+
this.regexp = LONG;
|
19
|
+
}
|
20
|
+
|
21
|
+
@Override
|
22
|
+
public LongLogElement create(String parameter) {
|
23
|
+
if(StringUtils.isEmpty(parameter)){
|
24
|
+
return new LongLogElement(name, regexp);
|
25
|
+
}else {
|
26
|
+
return new LongLogElement(name + "-" + parameter, regexp);
|
27
|
+
}
|
28
|
+
|
29
|
+
}
|
30
|
+
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
package org.embulk.parser.apache.log;
|
2
|
+
|
3
|
+
/**
 * Regular-expression fragments shared by the log element factories.
 *
 * <p>Every fragment contains exactly one capturing group.
 */
public interface Patterns {

    /** Zero or more non-whitespace characters. */
    String NON_SPACE = "([^\\s]*)";

    /** Four dot-separated runs of digits; the numeric range is not validated. */
    String IP_ADDRESS = "(\\d+(?:\\.\\d+){3})";

    /** An optionally negative integer, or a lone {@code -}. */
    String LONG = "(-?\\d+|-)";

    /** Any sequence of characters, including the empty one. */
    String ANY = "(.*)";

    /** A {@code /} followed by any characters other than {@code ?}. */
    String PATH = "(/[^\\?]*)";

    /** Optional: a {@code ?} followed by any characters. */
    String QUERY = "(\\?.*)?";

    /** Three digits, the first of which is not zero. */
    String STATUS = "([1-9]\\d{2})";

    /** One of the listed HTTP request methods, including PATCH (RFC 5789). */
    String METHOD = "(GET|POST|PUT|PATCH|DELETE|HEAD|OPTIONS|TRACE|CONNECT)";

    /** One of the characters {@code X}, {@code +} or {@code -}. */
    String CONN_STATUS = "([X+\\-])";

}
|