embulk-parser-apache-custom-log 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/CHANGES.md +9 -0
  4. data/LICENSE.txt +21 -0
  5. data/README.md +76 -0
  6. data/build.gradle +74 -0
  7. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  8. data/gradle/wrapper/gradle-wrapper.properties +6 -0
  9. data/gradlew +164 -0
  10. data/gradlew.bat +90 -0
  11. data/lib/embulk/guess/apache-custom-log.rb +61 -0
  12. data/lib/embulk/parser/apache-custom-log.rb +3 -0
  13. data/src/main/java/org/embulk/parser/ApacheCustomLogParserPlugin.java +109 -0
  14. data/src/main/java/org/embulk/parser/apache/log/LogElement.java +41 -0
  15. data/src/main/java/org/embulk/parser/apache/log/LogElementFactory.java +6 -0
  16. data/src/main/java/org/embulk/parser/apache/log/LogFormats.java +152 -0
  17. data/src/main/java/org/embulk/parser/apache/log/LongLogElement.java +29 -0
  18. data/src/main/java/org/embulk/parser/apache/log/LongLogElementFactory.java +30 -0
  19. data/src/main/java/org/embulk/parser/apache/log/Patterns.java +23 -0
  20. data/src/main/java/org/embulk/parser/apache/log/Replacement.java +27 -0
  21. data/src/main/java/org/embulk/parser/apache/log/StringLogElement.java +33 -0
  22. data/src/main/java/org/embulk/parser/apache/log/StringLogElementFactory.java +29 -0
  23. data/src/main/java/org/embulk/parser/apache/log/TimestampLogElement.java +42 -0
  24. data/src/main/java/org/embulk/parser/apache/log/TimestampLogElementFactory.java +24 -0
  25. data/src/test/java/org/embulk/parser/TestApacheLogParserPlugin.java +162 -0
  26. data/src/test/java/org/embulk/parser/apache/log/LogFormatsTest.java +39 -0
  27. data/src/test/java/org/embulk/parser/apache/log/PatternsTest.java +120 -0
  28. data/src/test/java/org/embulk/parser/apache/log/StringLogElementFactoryTest.java +91 -0
  29. data/src/test/java/org/embulk/parser/apache/log/StringLogElementTest.java +51 -0
  30. data/src/test/java/org/embulk/tester/DummyConfigSource.java +86 -0
  31. data/src/test/java/org/embulk/tester/EmbulkPluginTester.java +52 -0
  32. data/src/test/java/org/embulk/tester/TestExtension.java +52 -0
  33. data/src/test/resources/META-INF/services/org.embulk.spi.Extension +1 -0
  34. data/src/test/resources/data/access_log_2_combined +1 -0
  35. data/src/test/resources/data/access_log_combined +2 -0
  36. data/src/test/resources/data/access_log_common +1 -0
  37. data/src/test/resources/resource.txt +0 -0
  38. data/src/test/resources/temp/dummy +0 -0
  39. data/src/test/resources/yml/test_combined.yml +13 -0
  40. data/src/test/resources/yml/test_combined2.yml +13 -0
  41. data/src/test/resources/yml/test_common.yml +13 -0
  42. metadata +115 -0
@@ -0,0 +1,61 @@
1
+ module Embulk
2
+ module Guess
3
+
4
+ # TODO implement guess plugin to make this command work:
5
+ # $ embulk guess -g "apache-log" partial-config.yml
6
+ #
7
+ # Depending on the file format the plugin uses, you can choose
8
+ # one of binary guess (GuessPlugin), text guess (TextGuessPlugin),
9
+ # or line guess (LineGuessPlugin).
10
+
11
+ #class ApacheCustomLogParserGuessPlugin < GuessPlugin
12
+ # Plugin.register_guess("apache-log", self)
13
+ #
14
+ # def guess(config, sample_buffer)
15
+ # if sample_buffer[0,2] == GZIP_HEADER
16
+ # guessed = {}
17
+ # guessed["type"] = "apache-log"
18
+ # guessed["property1"] = "guessed-value"
19
+ # return {"parser" => guessed}
20
+ # else
21
+ # return {}
22
+ # end
23
+ # end
24
+ #end
25
+
26
+ #class ApacheLogParserGuessPlugin < TextGuessPlugin
27
+ # Plugin.register_guess("apache-log", self)
28
+ #
29
+ # def guess_text(config, sample_text)
30
+ # js = JSON.parse(sample_text) rescue nil
31
+ # if js && js["mykeyword"] == "keyword"
32
+ # guessed = {}
33
+ # guessed["type"] = "apache-log"
34
+ # guessed["property1"] = "guessed-value"
35
+ # return {"parser" => guessed}
36
+ # else
37
+ # return {}
38
+ # end
39
+ # end
40
+ #end
41
+
42
+ #class ApacheLogParserGuessPlugin < LineGuessPlugin
43
+ # Plugin.register_guess("apache-log", self)
44
+ #
45
+ # def guess_lines(config, sample_lines)
46
+ # all_line_matched = sample_lines.all? do |line|
47
+ # line =~ /mypattern/
48
+ # end
49
+ # if all_line_matched
50
+ # guessed = {}
51
+ # guessed["type"] = "apache-log"
52
+ # guessed["property1"] = "guessed-value"
53
+ # return {"parser" => guessed}
54
+ # else
55
+ # return {}
56
+ # end
57
+ # end
58
+ #end
59
+
60
+ end
61
+ end
@@ -0,0 +1,3 @@
1
# Register the Java-implemented parser with Embulk under the name
# "apache-custom-log". The last argument is the plugin's classpath directory,
# resolved relative to this file.
classpath_dir = File.expand_path('../../../../classpath', __FILE__)

Embulk::JavaPlugin.register_parser(
  "apache-custom-log",
  "org.embulk.parser.ApacheCustomLogParserPlugin",
  classpath_dir
)
@@ -0,0 +1,109 @@
1
+ package org.embulk.parser;
2
+
3
+ import com.google.common.collect.Lists;
4
+ import org.embulk.config.Config;
5
+ import org.embulk.config.ConfigSource;
6
+ import org.embulk.config.Task;
7
+ import org.embulk.config.TaskSource;
8
+ import org.embulk.parser.apache.log.LogElement;
9
+ import org.embulk.parser.apache.log.LogFormats;
10
+ import org.embulk.parser.apache.log.Replacement;
11
+ import org.embulk.spi.*;
12
+ import org.embulk.spi.time.TimestampParser;
13
+ import org.embulk.spi.util.LineDecoder;
14
+ import org.slf4j.Logger;
15
+ import org.slf4j.LoggerFactory;
16
+
17
+ import java.util.ArrayList;
18
+ import java.util.List;
19
+ import java.util.regex.Matcher;
20
+ import java.util.regex.Pattern;
21
+
22
+ public class ApacheCustomLogParserPlugin
23
+ implements ParserPlugin
24
+ {
25
+
26
+ private static final Logger logger = LoggerFactory.getLogger(ApacheCustomLogParserPlugin.class);
27
+
28
+ public interface PluginTask
29
+ extends Task, LineDecoder.DecoderTask, TimestampParser.Task
30
+ {
31
+
32
+ @Config("format")
33
+ String getFormat();
34
+
35
+ }
36
+
37
+ @Override
38
+ public void transaction(ConfigSource config, ParserPlugin.Control control)
39
+ {
40
+ PluginTask task = config.loadConfig(PluginTask.class);
41
+ ArrayList<ColumnConfig> columns = Lists.newArrayList();
42
+ final String format = task.getFormat();
43
+
44
+ List<Replacement> replacements = new LogFormats(task).getReplacements(format);
45
+
46
+ for (Replacement replacement : replacements) {
47
+ LogElement<?> logElement = replacement.getLogElement();
48
+ columns.add(logElement.getColumnConfig(config));
49
+ }
50
+
51
+ Schema schema = new SchemaConfig(columns).toSchema();
52
+ control.run(task.dump(), schema);
53
+ }
54
+
55
+ @Override
56
+ public void run(TaskSource taskSource, Schema schema,
57
+ FileInput input, PageOutput output)
58
+ {
59
+ PluginTask task = taskSource.loadTask(PluginTask.class);
60
+ LineDecoder lineDecoder = new LineDecoder(input,task);
61
+ PageBuilder pageBuilder = new PageBuilder(Exec.getBufferAllocator(), schema, output);
62
+ String line;
63
+ final String format = task.getFormat();
64
+ LogFormats logFormats = new LogFormats(task);
65
+
66
+ List<Replacement> replacements = logFormats.getReplacements(format);
67
+
68
+ String regexp = logFormats.logFormat2RegexpString(format);
69
+
70
+ logger.info("LogFormat : " + format);
71
+ logger.info("RegExp : " + regexp);
72
+
73
+ Pattern accessLogPattern = Pattern.compile("^" + regexp + "$", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
74
+ Matcher accessLogEntryMatcher;
75
+
76
+ int replacementSize = replacements.size();
77
+
78
+ logger.info("replacement : " + replacementSize);
79
+
80
+ while( input.nextFile() ){
81
+ while(true){
82
+ line = lineDecoder.poll();
83
+
84
+ if (line == null) {
85
+ break;
86
+ }
87
+
88
+ accessLogEntryMatcher = accessLogPattern.matcher(line);
89
+
90
+ if(replacementSize != accessLogEntryMatcher.groupCount()){
91
+ logger.warn("group count mismatch. + expected : " + replacementSize);
92
+ }
93
+
94
+ while(accessLogEntryMatcher.find()){
95
+ for (int i = 0; i < replacementSize; i++) {
96
+ LogElement<?> logElement = replacements.get(i).getLogElement();
97
+ String value = accessLogEntryMatcher.group(i + 1);
98
+
99
+ logElement.setToPageBuilder(pageBuilder, i, value);
100
+ }
101
+ }
102
+
103
+ pageBuilder.addRecord();
104
+ }
105
+ }
106
+ pageBuilder.finish();
107
+ }
108
+
109
+ }
@@ -0,0 +1,41 @@
1
+ package org.embulk.parser.apache.log;
2
+
3
+
4
+ import org.embulk.config.ConfigSource;
5
+ import org.embulk.spi.ColumnConfig;
6
+ import org.embulk.spi.PageBuilder;
7
+ import org.embulk.spi.type.Type;
8
+
9
+ public abstract class LogElement<T> {
10
+
11
+ protected String name;
12
+ protected String regexp;
13
+ protected final Type outputType;
14
+
15
+ public LogElement(String name, String regex, Type outputType){
16
+ this.name = name;
17
+ this.regexp = regex;
18
+ this.outputType = outputType;
19
+ }
20
+
21
+ public String getName(){
22
+ return name;
23
+ }
24
+
25
+ public String getRegexp() {
26
+ return regexp;
27
+ }
28
+
29
+ public Type getOutputType() {
30
+ return outputType;
31
+ }
32
+
33
+ public abstract T parse(String s);
34
+
35
+ public abstract void setToPageBuilder(PageBuilder pageBuilder, int i, String value);
36
+
37
+ public ColumnConfig getColumnConfig(ConfigSource config){
38
+ return new ColumnConfig(name, outputType, config);
39
+ }
40
+
41
+ }
@@ -0,0 +1,6 @@
1
+ package org.embulk.parser.apache.log;
2
+
3
+
4
+ public interface LogElementFactory<T extends LogElement> {
5
+ T create(String parameter);
6
+ }
@@ -0,0 +1,152 @@
1
+ package org.embulk.parser.apache.log;
2
+
3
+ import com.google.common.collect.Lists;
4
+ import org.embulk.spi.time.TimestampParser;
5
+
6
+ import java.util.*;
7
+ import java.util.regex.Matcher;
8
+ import java.util.regex.Pattern;
9
+
10
+
11
+ public class LogFormats implements Patterns {
12
+
13
+ TimestampParser.Task task;
14
+
15
+ public LogFormats(TimestampParser.Task task) {
16
+ this.task = task;
17
+ }
18
+
19
+ public Map<String, LogElementFactory<? extends LogElement>> getLogElementMappings(){
20
+
21
+ Map<String, LogElementFactory<? extends LogElement>> mapping = new HashMap<>();
22
+
23
+ mapping.put("a", new StringLogElementFactory("remote-ip", IP_ADDRESS));
24
+ mapping.put("A", new StringLogElementFactory("local-ip", IP_ADDRESS));
25
+ mapping.put("b", new LongLogElementFactory("response-bytes"));
26
+ mapping.put("B", new LongLogElementFactory("response-bytes"));
27
+ mapping.put("C", new StringLogElementFactory("request-cookie"));
28
+ mapping.put("D", new LongLogElementFactory("request-process-time-us"));
29
+ mapping.put("e", new StringLogElementFactory("env"));
30
+ mapping.put("f", new StringLogElementFactory("file-name"));
31
+ mapping.put("h", new StringLogElementFactory("remote-host"));
32
+ mapping.put("H", new StringLogElementFactory("request-protocol", NON_SPACE));
33
+ mapping.put("i", new StringLogElementFactory("request-header"));
34
+ mapping.put("l", new StringLogElementFactory("remote-log-name", NON_SPACE));
35
+ mapping.put("m", new StringLogElementFactory("request-method", METHOD));
36
+
37
+ mapping.put("n", new StringLogElementFactory("module-note"));
38
+ mapping.put("o", new StringLogElementFactory("response-header"));
39
+
40
+ mapping.put("p", new LongLogElementFactory("request-port"));
41
+
42
+ mapping.put("P", new LongLogElementFactory("request-process"));
43
+
44
+ mapping.put("q", new StringLogElementFactory("request-query", QUERY));
45
+
46
+ mapping.put("r", new StringLogElementFactory("request-line"));
47
+ mapping.put("s", new LongLogElementFactory("response-status", STATUS));
48
+
49
+ mapping.put("t", new TimestampLogElementFactory(task, "request-time"));
50
+
51
+ mapping.put("T", new LongLogElementFactory("request-process-time-s"));
52
+
53
+ mapping.put("u", new StringLogElementFactory("request-user"));
54
+ mapping.put("U", new StringLogElementFactory("request-path", PATH));
55
+ mapping.put("v", new StringLogElementFactory("request-server-name", NON_SPACE));
56
+ mapping.put("V", new StringLogElementFactory("canonical-server-name", NON_SPACE));
57
+ mapping.put("X", new StringLogElementFactory("connection-status", CONN_STATUS));
58
+ mapping.put("I", new LongLogElementFactory("request-total-bytes"));
59
+ mapping.put("O", new LongLogElementFactory("response-total-bytes"));
60
+
61
+ mapping.put("%", new StringLogElementFactory("%", "(¥¥%)"));
62
+
63
+ return mapping;
64
+ }
65
+
66
+ /**
67
+ * RegExp pattern of extract log format key
68
+ *
69
+ * this pattern has 9 groups, which are described as below.
70
+ *
71
+ * (%((!)?(\d{3}(,\d{3})*))?(<|>)?(\{([^\}]+)\})?([A-z]))
72
+ * | || | | | | | |- group(9) key
73
+ * | || | | | | |------------- group(8) optional parameter
74
+ * | || | | | |---------------- group(7) optional parameter wrapper group
75
+ * | || | | |---------------------- group(6) logging timing parameter
76
+ * | || | |---------------------------------- group(5) additional http status(es)
77
+ * | || |---------------------------------------- group(4) http status(es)
78
+ * | ||-------------------------------------------- group(3) inverse http status specifier
79
+ * | |--------------------------------------------- group(2) http status specifier
80
+ * |----------------------------------------------- group(0), group(1)
81
+ *
82
+ */
83
+ public static final Pattern logFormatExtractor =
84
+ Pattern.compile("(%((!)?(\\d{3}(,\\d{3})*))?(<|>)?(\\{([^\\}]+)\\})?([A-z]))",
85
+ Pattern.DOTALL);
86
+
87
+ /**
88
+ * Convert logFormat String to Regexp String
89
+ * @param logFormat apache custom log format
90
+ * @return The pattern that matches CustomLog Configuration.
91
+ *
92
+ */
93
+ public String logFormat2RegexpString(String logFormat){
94
+ List<Replacement> replacements = getReplacements(logFormat);
95
+ return replace(logFormat, replacements);
96
+ }
97
+
98
+ private String replace(String logFormat, List<Replacement> replacements) {
99
+ int offset = 0;
100
+
101
+ for (Replacement replacement : replacements) {
102
+ String left = logFormat.substring(0, offset + replacement.getStart());
103
+ String right = logFormat.substring(offset + replacement.getEnd(), logFormat.length());
104
+ int originalLength = logFormat.length() - left.length() - right.length();
105
+
106
+ String regexp = replacement.getLogElement().getRegexp();
107
+ logFormat = left + regexp + right;
108
+ offset += regexp.length() - originalLength;
109
+ }
110
+ return logFormat;
111
+ }
112
+
113
+ public List<Replacement> getReplacements(String logFormat) {
114
+ Matcher matcher = logFormatExtractor.matcher(logFormat);
115
+
116
+ List<Replacement> replacements = Lists.newArrayList();
117
+
118
+ while(matcher.find()){
119
+ if(matcher.groupCount() != 9){
120
+ throw new IllegalArgumentException("invalid regexp pattern");
121
+ }
122
+ String all = empty(matcher.group(1));
123
+
124
+ //TODO implement
125
+ //String ignoreStatus = empty(matcher.group(3));
126
+ //Object[] statuses = Arrays.stream(empty(matcher.group(4)).split(",")).toArray();
127
+ //String position = empty(matcher.group(6));
128
+
129
+ String parameter = matcher.group(8);
130
+ String key = empty(matcher.group(9));
131
+
132
+ LogElementFactory<? extends LogElement> factory = getLogElementMappings().get(key);
133
+
134
+ if(factory != null){
135
+ int start = matcher.start();
136
+ int end = matcher.end();
137
+ LogElement logElement = factory.create(parameter);
138
+ replacements.add(new Replacement(start, end, logElement));
139
+ }else{
140
+ throw new IllegalStateException("unknown log format key " + all);
141
+ }
142
+
143
+ }
144
+ return replacements;
145
+ }
146
+
147
+ private String empty(String s){
148
+ return s == null ? "" : s;
149
+ }
150
+
151
+
152
+ }
@@ -0,0 +1,29 @@
1
+ package org.embulk.parser.apache.log;
2
+
3
+
4
+ import org.embulk.spi.PageBuilder;
5
+ import org.embulk.spi.type.Types;
6
+
7
+ public class LongLogElement extends LogElement<Long> {
8
+
9
+ public LongLogElement(String name, String regex) {
10
+ super(name, regex, Types.LONG);
11
+ }
12
+
13
+ @Override
14
+ public Long parse(String s) {
15
+ try{
16
+ if("-".equals(s)){
17
+ return 0L;
18
+ }
19
+ return Long.parseLong(s);
20
+ }catch (NumberFormatException e){
21
+ return 0L;
22
+ }
23
+ }
24
+
25
+ @Override
26
+ public void setToPageBuilder(PageBuilder pageBuilder, int i, String value) {
27
+ pageBuilder.setLong(i, parse(value));
28
+ }
29
+ }
@@ -0,0 +1,30 @@
1
+ package org.embulk.parser.apache.log;
2
+
3
+
4
+ import org.apache.commons.lang3.StringUtils;
5
+
6
+ public class LongLogElementFactory implements LogElementFactory<LongLogElement>, Patterns {
7
+
8
+ private String name;
9
+ private String regexp;
10
+
11
+ public LongLogElementFactory(String name, String regexp) {
12
+ this.name = name;
13
+ this.regexp = regexp;
14
+ }
15
+
16
+ public LongLogElementFactory(String name) {
17
+ this.name = name;
18
+ this.regexp = LONG;
19
+ }
20
+
21
+ @Override
22
+ public LongLogElement create(String parameter) {
23
+ if(StringUtils.isEmpty(parameter)){
24
+ return new LongLogElement(name, regexp);
25
+ }else {
26
+ return new LongLogElement(name + "-" + parameter, regexp);
27
+ }
28
+
29
+ }
30
+ }
@@ -0,0 +1,23 @@
1
+ package org.embulk.parser.apache.log;
2
+
3
/**
 * Regular expression fragments used for individual log elements. Every
 * fragment contains exactly one capturing group.
 */
public interface Patterns {

    /** Zero or more non-whitespace characters. */
    String NON_SPACE = "([^\\s]*)";

    /** Four dot-separated runs of digits (dotted-quad form; digit values are not range-checked). */
    String IP_ADDRESS = "(\\d+(?:\\.\\d+){3})";

    /** An optionally negative integer, or a lone "-". */
    String LONG = "(-?\\d+|-)";

    /** Any sequence of characters, possibly empty. */
    String ANY = "(.*)";

    /** A "/" followed by any characters other than "?". */
    String PATH = "(/[^\\?]*)";

    /** A "?" followed by any characters; the whole group is optional. */
    String QUERY = "(\\?.*)?";

    /** Three digits, the first of which is not zero. */
    String STATUS = "([1-9]\\d{2})";

    /** One of the listed HTTP request methods (PATCH included, so PATCH request lines can match). */
    String METHOD = "(GET|POST|PUT|PATCH|DELETE|HEAD|OPTIONS|TRACE|CONNECT)";

    /** One of the characters "X", "+" or "-". */
    String CONN_STATUS = "([X+\\-])";

}