embulk-parser-regex 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +5 -5
- data/build.gradle +1 -1
- data/lib/embulk/guess/regex.rb +12 -12
- data/sample/apache_1/config.yml +4 -4
- data/sample/apache_2/config.yml +3 -3
- data/src/main/java/org/embulk/parser/regex/RegexParserPlugin.java +2 -2
- metadata +23 -25
- data/embulk-parser-regex.iml +0 -49
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 180e9d8ef6ff872d1aa840f937972ba228b2762c
|
4
|
+
data.tar.gz: 5ae14fb13a5ac890f8f4c7224cf14fbe660ddb22
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bf0f5d97601bd217bc28b955d1046456d9231c9cbf7614c82fe90dfe70fb43716ce09ab3762e0bf2737ffc9a1e6774e23086e7a61c2b5526afc284619df89a0d
|
7
|
+
data.tar.gz: 7e1a23835f82c112616998ce5c121ddac25299ac0d91afb934b6a5f8218b295a561a7842446b261a79a83dea9d267dca1d60f827e3a72e99675c824b4ee40a7d
|
data/README.md
CHANGED
@@ -11,7 +11,7 @@ A simple parser Using Regular Expression.
|
|
11
11
|
|
12
12
|
- **regex**: regular expression that must use [Named Capturing Group](https://blogs.oracle.com/xuemingshen/entry/named_capturing_group_in_jdk7) (string, required)
|
13
13
|
- **columns**: column definition (list of object)
|
14
|
-
- **
|
14
|
+
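- **regex_name**: name of the capturing group in the regex that this column is read from. A 'Named Capturing Group' name can only contain `[a-zA-Z0-9]`, so use this option when the column name (e.g. `remote_host`) cannot be used as the group name (string, default: the column's `name` value)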
|
15
15
|
- **skip_if_unmatch**: if false, a RuntimeException is raised when a line doesn't match the regex. If true, the line is skipped. (boolean, default: `false`)
|
16
16
|
|
17
17
|
## Example
|
@@ -23,7 +23,7 @@ in:
|
|
23
23
|
type: regex
|
24
24
|
regex: ^(?<remoteHost>[.:0-9]+) (?<identity>\S+) (?<user>\S+) \[(?<datetime>[^\]]*)\] "((?<method>\S+) (?<path>\S+) (?<protocol>HTTP/\d+\.\d+)|-)" (?<status>[0-9]+) (?<size>[0-9]+|-) "(?<referer>[^"]*)" "(?<userAgent>[^"]*)" (?<inByte>[0-9]+) (?<outByte>[0-9]+)$
|
25
25
|
columns:
|
26
|
-
- {name: remote_host, type: string,
|
26
|
+
- {name: remote_host, type: string, regex_name: remoteHost}
|
27
27
|
- {name: identity, type: string}
|
28
28
|
- {name: user, type: string}
|
29
29
|
- {name: datetime, type: timestamp, format: '%d/%b/%Y:%H:%M:%S %z'}
|
@@ -33,9 +33,9 @@ in:
|
|
33
33
|
- {name: status, type: long}
|
34
34
|
- {name: size, type: long}
|
35
35
|
- {name: referer, type: string}
|
36
|
-
- {name: user_agent, type: string,
|
37
|
-
- {name: in_byte, type: long,
|
38
|
-
- {name: out_byte, type: long,
|
36
|
+
- {name: user_agent, type: string, regex_name: userAgent}
|
37
|
+
- {name: in_byte, type: long, regex_name: inByte}
|
38
|
+
- {name: out_byte, type: long, regex_name: outByte}
|
39
39
|
```
|
40
40
|
|
41
41
|
### Guess
|
data/build.gradle
CHANGED
data/lib/embulk/guess/regex.rb
CHANGED
@@ -26,12 +26,12 @@ module Embulk
|
|
26
26
|
|
27
27
|
def apache_x_forwarded_for
|
28
28
|
RegexApacheLogGuesser.new
|
29
|
-
.ip_or_minus(:x_forwarded_for,
|
29
|
+
.ip_or_minus(:x_forwarded_for, regex_name: 'forwardedFor')
|
30
30
|
end
|
31
31
|
|
32
32
|
def apache_common(config, sample_lines)
|
33
33
|
RegexApacheLogGuesser.new
|
34
|
-
.ip(:remote_host,
|
34
|
+
.ip(:remote_host, regex_name: 'remoteHost').token(:identity).token(:user)
|
35
35
|
.kakko(:datetime, format: '%d/%b/%Y:%H:%M:%S %z', type: 'timestamp')
|
36
36
|
.method_path_protocol
|
37
37
|
.integer(:status).integer_or_minus(:size)
|
@@ -39,12 +39,12 @@ module Embulk
|
|
39
39
|
|
40
40
|
def apache_combined(config, sample_lines)
|
41
41
|
apache_common(config, sample_lines)
|
42
|
-
.string(:referer).string(:user_agent,
|
42
|
+
.string(:referer).string(:user_agent, regex_name: 'userAgent')
|
43
43
|
end
|
44
44
|
|
45
45
|
def apache_combinedio(config, sample_lines)
|
46
46
|
apache_combined(config, sample_lines)
|
47
|
-
.integer(:in_byte,
|
47
|
+
.integer(:in_byte, regex_name: 'inByte').integer(:out_byte, regex_name: 'outByte')
|
48
48
|
end
|
49
49
|
end
|
50
50
|
|
@@ -82,49 +82,49 @@ module Embulk
|
|
82
82
|
end
|
83
83
|
|
84
84
|
def ip(name, opts={})
|
85
|
-
@patterns << "(?<#{opts[:
|
85
|
+
@patterns << "(?<#{opts[:regex_name] || name}>[.:0-9]+)"
|
86
86
|
@columns << {:name => name, :type => 'string'}.merge(opts)
|
87
87
|
self
|
88
88
|
end
|
89
89
|
|
90
90
|
def ip_or_minus(name, opts={})
|
91
|
-
@patterns << "(?<#{opts[:
|
91
|
+
@patterns << "(?<#{opts[:regex_name] || name}>[.:0-9]+|-)"
|
92
92
|
@columns << {:name => name, :type => 'string'}.merge(opts)
|
93
93
|
self
|
94
94
|
end
|
95
95
|
|
96
96
|
def token(name, opts={})
|
97
|
-
@patterns << "(?<#{opts[:
|
97
|
+
@patterns << "(?<#{opts[:regex_name] || name}>\\S+)"
|
98
98
|
@columns << {:name => name, :type => 'string'}.merge(opts)
|
99
99
|
self
|
100
100
|
end
|
101
101
|
|
102
102
|
def string(name, opts={})
|
103
|
-
@patterns << "\"(?<#{opts[:
|
103
|
+
@patterns << "\"(?<#{opts[:regex_name] || name}>[^\"]*)\""
|
104
104
|
@columns << {:name => name, :type => 'string'}.merge(opts)
|
105
105
|
self
|
106
106
|
end
|
107
107
|
|
108
108
|
def string_or_minus(name, opts={})
|
109
|
-
@patterns << "\"(?<#{opts[:
|
109
|
+
@patterns << "\"(?<#{opts[:regex_name] || name}>[^\"]*|-)\""
|
110
110
|
@columns << {:name => name, :type => 'string'}.merge(opts)
|
111
111
|
self
|
112
112
|
end
|
113
113
|
|
114
114
|
def integer(name, opts={})
|
115
|
-
@patterns << "(?<#{opts[:
|
115
|
+
@patterns << "(?<#{opts[:regex_name] || name}>[0-9]+)"
|
116
116
|
@columns << {:name => name, :type => 'long'}.merge(opts)
|
117
117
|
self
|
118
118
|
end
|
119
119
|
|
120
120
|
def integer_or_minus(name, opts={})
|
121
|
-
@patterns << "(?<#{opts[:
|
121
|
+
@patterns << "(?<#{opts[:regex_name] || name}>[0-9]+|-)"
|
122
122
|
@columns << {:name => name, :type => 'long'}.merge(opts)
|
123
123
|
self
|
124
124
|
end
|
125
125
|
|
126
126
|
def kakko(name, opts={})
|
127
|
-
@patterns << "\\[(?<#{opts[:
|
127
|
+
@patterns << "\\[(?<#{opts[:regex_name] || name}>[^\\]]*)\\]"
|
128
128
|
@columns << {:name => name, :type => 'string'}.merge(opts)
|
129
129
|
self
|
130
130
|
end
|
data/sample/apache_1/config.yml
CHANGED
@@ -7,7 +7,7 @@ in:
|
|
7
7
|
type: regex
|
8
8
|
regex: ^(?<remoteHost>[.:0-9]+) (?<identity>\S+) (?<user>\S+) \[(?<datetime>[^\]]*)\] "((?<method>\S+) (?<path>\S+) (?<protocol>HTTP/\d+\.\d+)|-)" (?<status>[0-9]+) (?<size>[0-9]+|-) "(?<referer>[^"]*)" "(?<userAgent>[^"]*)" (?<inByte>[0-9]+) (?<outByte>[0-9]+)$
|
9
9
|
columns:
|
10
|
-
- {name: remote_host, type: string,
|
10
|
+
- {name: remote_host, type: string, regex_name: remoteHost}
|
11
11
|
- {name: identity, type: string}
|
12
12
|
- {name: user, type: string}
|
13
13
|
- {name: datetime, type: timestamp, format: '%d/%b/%Y:%H:%M:%S %z'}
|
@@ -17,7 +17,7 @@ in:
|
|
17
17
|
- {name: status, type: long}
|
18
18
|
- {name: size, type: long}
|
19
19
|
- {name: referer, type: string}
|
20
|
-
- {name: user_agent, type: string,
|
21
|
-
- {name: in_byte, type: long,
|
22
|
-
- {name: out_byte, type: long,
|
20
|
+
- {name: user_agent, type: string, regex_name: userAgent}
|
21
|
+
- {name: in_byte, type: long, regex_name: inByte}
|
22
|
+
- {name: out_byte, type: long, regex_name: outByte}
|
23
23
|
out: {type: stdout}
|
data/sample/apache_2/config.yml
CHANGED
@@ -7,8 +7,8 @@ in:
|
|
7
7
|
type: regex
|
8
8
|
regex: ^(?<forwardedFor>[.:0-9]+|-) (?<remoteHost>[.:0-9]+) (?<identity>\S+) (?<user>\S+) \[(?<datetime>[^\]]*)\] "((?<method>\S+) (?<path>\S+) (?<protocol>HTTP/\d+\.\d+)|-)" (?<status>[0-9]+) (?<size>[0-9]+|-) "(?<referer>[^"]*)" "(?<userAgent>[^"]*)"$
|
9
9
|
columns:
|
10
|
-
- {name: x_forwarded_for, type: string,
|
11
|
-
- {name: remote_host, type: string,
|
10
|
+
- {name: x_forwarded_for, type: string, regex_name: forwardedFor}
|
11
|
+
- {name: remote_host, type: string, regex_name: remoteHost}
|
12
12
|
- {name: identity, type: string}
|
13
13
|
- {name: user, type: string}
|
14
14
|
- {name: datetime, type: timestamp, format: '%d/%b/%Y:%H:%M:%S %z'}
|
@@ -18,5 +18,5 @@ in:
|
|
18
18
|
- {name: status, type: long}
|
19
19
|
- {name: size, type: long}
|
20
20
|
- {name: referer, type: string}
|
21
|
-
- {name: user_agent, type: string,
|
21
|
+
- {name: user_agent, type: string, regex_name: userAgent}
|
22
22
|
out: {type: stdout}
|
@@ -70,7 +70,7 @@ public class RegexParserPlugin implements ParserPlugin {
|
|
70
70
|
// TODO: How to Log?
|
71
71
|
continue;
|
72
72
|
} else {
|
73
|
-
throw new
|
73
|
+
throw new DataException("Unmatched Line: " + line);
|
74
74
|
}
|
75
75
|
}
|
76
76
|
|
@@ -100,7 +100,7 @@ public class RegexParserPlugin implements ParserPlugin {
|
|
100
100
|
String name = c.getName();
|
101
101
|
Type type = c.getType();
|
102
102
|
Column column = c.toColumn(index);
|
103
|
-
String regexName = c.getOption().get(String.class, "
|
103
|
+
String regexName = c.getOption().get(String.class, "regex_name", name);
|
104
104
|
|
105
105
|
DefaultValueSetter defaultValue = new NullDefaultValueSetter();
|
106
106
|
DynamicColumnSetter setter;
|
metadata
CHANGED
@@ -1,57 +1,54 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-parser-regex
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ken Morishita
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
date: 2015-08-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
|
-
|
15
|
+
version_requirements: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - ~>
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.0'
|
20
|
-
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirement: !ruby/object:Gem::Requirement
|
23
21
|
requirements:
|
24
|
-
- -
|
22
|
+
- - ~>
|
25
23
|
- !ruby/object:Gem::Version
|
26
24
|
version: '1.0'
|
25
|
+
prerelease: false
|
26
|
+
type: :development
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
|
-
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - ~>
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '10.0'
|
34
|
-
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
34
|
+
requirement: !ruby/object:Gem::Requirement
|
37
35
|
requirements:
|
38
|
-
- -
|
36
|
+
- - ~>
|
39
37
|
- !ruby/object:Gem::Version
|
40
38
|
version: '10.0'
|
41
|
-
|
42
|
-
|
39
|
+
prerelease: false
|
40
|
+
type: :development
|
41
|
+
description: Parses lines using regular-expression in files read by other file input plugins.
|
43
42
|
email:
|
44
43
|
- mokemokechicken@gmail.com
|
45
44
|
executables: []
|
46
45
|
extensions: []
|
47
46
|
extra_rdoc_files: []
|
48
47
|
files:
|
49
|
-
-
|
48
|
+
- .gitignore
|
50
49
|
- LICENSE.txt
|
51
50
|
- README.md
|
52
51
|
- build.gradle
|
53
|
-
- classpath/embulk-parser-regex-0.1.0.jar
|
54
|
-
- embulk-parser-regex.iml
|
55
52
|
- gradle/wrapper/gradle-wrapper.jar
|
56
53
|
- gradle/wrapper/gradle-wrapper.properties
|
57
54
|
- gradlew
|
@@ -68,28 +65,29 @@ files:
|
|
68
65
|
- sample/simple/data_simple_1.txt
|
69
66
|
- src/main/java/org/embulk/parser/regex/RegexParserPlugin.java
|
70
67
|
- src/test/java/org/embulk/parser/regex/TestRegexParserPlugin.java
|
68
|
+
- classpath/embulk-parser-regex-0.2.0.jar
|
71
69
|
homepage: https://github.com/mokemokechicken/embulk-parser-regex
|
72
70
|
licenses:
|
73
71
|
- MIT
|
74
72
|
metadata: {}
|
75
|
-
post_install_message:
|
73
|
+
post_install_message:
|
76
74
|
rdoc_options: []
|
77
75
|
require_paths:
|
78
76
|
- lib
|
79
77
|
required_ruby_version: !ruby/object:Gem::Requirement
|
80
78
|
requirements:
|
81
|
-
- -
|
79
|
+
- - '>='
|
82
80
|
- !ruby/object:Gem::Version
|
83
81
|
version: '0'
|
84
82
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
83
|
requirements:
|
86
|
-
- -
|
84
|
+
- - '>='
|
87
85
|
- !ruby/object:Gem::Version
|
88
86
|
version: '0'
|
89
87
|
requirements: []
|
90
|
-
rubyforge_project:
|
91
|
-
rubygems_version: 2.
|
92
|
-
signing_key:
|
88
|
+
rubyforge_project:
|
89
|
+
rubygems_version: 2.1.9
|
90
|
+
signing_key:
|
93
91
|
specification_version: 4
|
94
92
|
summary: Regex parser plugin for Embulk
|
95
93
|
test_files: []
|
data/embulk-parser-regex.iml
DELETED
@@ -1,49 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
-
<module external.linked.project.id="embulk-parser-regex" external.linked.project.path="$MODULE_DIR$" external.root.project.path="$MODULE_DIR$" external.system.id="GRADLE" external.system.module.group="" external.system.module.version="0.1.0" type="JAVA_MODULE" version="4">
|
3
|
-
<component name="NewModuleRootManager" inherit-compiler-output="false">
|
4
|
-
<output url="file://$MODULE_DIR$/build/classes/main" />
|
5
|
-
<output-test url="file://$MODULE_DIR$/build/classes/test" />
|
6
|
-
<exclude-output />
|
7
|
-
<content url="file://$MODULE_DIR$">
|
8
|
-
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
|
9
|
-
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
|
10
|
-
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
|
11
|
-
<sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" />
|
12
|
-
<excludeFolder url="file://$MODULE_DIR$/.gradle" />
|
13
|
-
<excludeFolder url="file://$MODULE_DIR$/build" />
|
14
|
-
</content>
|
15
|
-
<orderEntry type="inheritedJdk" />
|
16
|
-
<orderEntry type="sourceFolder" forTests="false" />
|
17
|
-
<orderEntry type="library" name="Gradle: org.embulk:embulk-core:0.7.4" level="project" />
|
18
|
-
<orderEntry type="library" name="Gradle: com.google.guava:guava:18.0" level="project" />
|
19
|
-
<orderEntry type="library" name="Gradle: com.google.inject:guice:4.0" level="project" />
|
20
|
-
<orderEntry type="library" name="Gradle: com.google.inject.extensions:guice-multibindings:4.0" level="project" />
|
21
|
-
<orderEntry type="library" name="Gradle: javax.inject:javax.inject:1" level="project" />
|
22
|
-
<orderEntry type="library" name="Gradle: com.fasterxml.jackson.core:jackson-annotations:2.5.3" level="project" />
|
23
|
-
<orderEntry type="library" name="Gradle: com.fasterxml.jackson.core:jackson-core:2.5.3" level="project" />
|
24
|
-
<orderEntry type="library" name="Gradle: com.fasterxml.jackson.core:jackson-databind:2.5.3" level="project" />
|
25
|
-
<orderEntry type="library" name="Gradle: com.fasterxml.jackson.datatype:jackson-datatype-guava:2.5.3" level="project" />
|
26
|
-
<orderEntry type="library" name="Gradle: com.fasterxml.jackson.datatype:jackson-datatype-joda:2.5.3" level="project" />
|
27
|
-
<orderEntry type="library" name="Gradle: com.fasterxml.jackson.module:jackson-module-guice:2.5.3" level="project" />
|
28
|
-
<orderEntry type="library" name="Gradle: ch.qos.logback:logback-classic:1.1.3" level="project" />
|
29
|
-
<orderEntry type="library" name="Gradle: org.slf4j:slf4j-api:1.7.12" level="project" />
|
30
|
-
<orderEntry type="library" name="Gradle: org.jruby:jruby-complete:9.0.0.0" level="project" />
|
31
|
-
<orderEntry type="library" name="Gradle: com.google.code.findbugs:annotations:3.0.0" level="project" />
|
32
|
-
<orderEntry type="library" name="Gradle: org.yaml:snakeyaml:1.14" level="project" />
|
33
|
-
<orderEntry type="library" name="Gradle: javax.validation:validation-api:1.1.0.Final" level="project" />
|
34
|
-
<orderEntry type="library" name="Gradle: org.apache.bval:bval-jsr303:0.5" level="project" />
|
35
|
-
<orderEntry type="library" name="Gradle: io.airlift:slice:0.9" level="project" />
|
36
|
-
<orderEntry type="library" name="Gradle: joda-time:joda-time:2.8.1" level="project" />
|
37
|
-
<orderEntry type="library" name="Gradle: io.netty:netty-buffer:5.0.0.Alpha1" level="project" />
|
38
|
-
<orderEntry type="library" name="Gradle: org.fusesource.jansi:jansi:1.11" level="project" />
|
39
|
-
<orderEntry type="library" name="Gradle: com.ibm.icu:icu4j:54.1.1" level="project" />
|
40
|
-
<orderEntry type="library" name="Gradle: aopalliance:aopalliance:1.0" level="project" />
|
41
|
-
<orderEntry type="library" name="Gradle: ch.qos.logback:logback-core:1.1.3" level="project" />
|
42
|
-
<orderEntry type="library" name="Gradle: org.apache.bval:bval-core:0.5" level="project" />
|
43
|
-
<orderEntry type="library" name="Gradle: org.apache.commons:commons-lang3:3.1" level="project" />
|
44
|
-
<orderEntry type="library" name="Gradle: io.netty:netty-common:5.0.0.Alpha1" level="project" />
|
45
|
-
<orderEntry type="library" name="Gradle: commons-beanutils:commons-beanutils-core:1.8.3" level="project" />
|
46
|
-
<orderEntry type="library" scope="TEST" name="Gradle: junit:junit:4.12" level="project" />
|
47
|
-
<orderEntry type="library" scope="TEST" name="Gradle: org.hamcrest:hamcrest-core:1.3" level="project" />
|
48
|
-
</component>
|
49
|
-
</module>
|