embulk-parser-regex 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +5 -5
- data/build.gradle +1 -1
- data/lib/embulk/guess/regex.rb +12 -12
- data/sample/apache_1/config.yml +4 -4
- data/sample/apache_2/config.yml +3 -3
- data/src/main/java/org/embulk/parser/regex/RegexParserPlugin.java +2 -2
- metadata +23 -25
- data/embulk-parser-regex.iml +0 -49
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 180e9d8ef6ff872d1aa840f937972ba228b2762c
|
4
|
+
data.tar.gz: 5ae14fb13a5ac890f8f4c7224cf14fbe660ddb22
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bf0f5d97601bd217bc28b955d1046456d9231c9cbf7614c82fe90dfe70fb43716ce09ab3762e0bf2737ffc9a1e6774e23086e7a61c2b5526afc284619df89a0d
|
7
|
+
data.tar.gz: 7e1a23835f82c112616998ce5c121ddac25299ac0d91afb934b6a5f8218b295a561a7842446b261a79a83dea9d267dca1d60f827e3a72e99675c824b4ee40a7d
|
data/README.md
CHANGED
@@ -11,7 +11,7 @@ A simple parser Using Regular Expression.
|
|
11
11
|
|
12
12
|
- **regex**: regular expression that must use [Named Capturing Group](https://blogs.oracle.com/xuemingshen/entry/named_capturing_group_in_jdk7) (string, required)
|
13
13
|
- **columns**: column definition (list of object)
|
14
|
-
- **
|
14
|
+
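- **regex_name**: name of the capturing group in `regex` that this column is read from. A 'Named Capturing Group' name can only include `[a-zA-Z0-9]`, so specify this alias when the column name (e.g. `remote_host`) cannot be used as the group name (string, default: the column's `name` value)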
|
15
15
|
- **skip_if_unmatch**: if false, raise RuntimeException when a line doesn't match the regex. If true, skip the line. (boolean, default: `false`)
|
16
16
|
|
17
17
|
## Example
|
@@ -23,7 +23,7 @@ in:
|
|
23
23
|
type: regex
|
24
24
|
regex: ^(?<remoteHost>[.:0-9]+) (?<identity>\S+) (?<user>\S+) \[(?<datetime>[^\]]*)\] "((?<method>\S+) (?<path>\S+) (?<protocol>HTTP/\d+\.\d+)|-)" (?<status>[0-9]+) (?<size>[0-9]+|-) "(?<referer>[^"]*)" "(?<userAgent>[^"]*)" (?<inByte>[0-9]+) (?<outByte>[0-9]+)$
|
25
25
|
columns:
|
26
|
-
- {name: remote_host, type: string,
|
26
|
+
- {name: remote_host, type: string, regex_name: remoteHost}
|
27
27
|
- {name: identity, type: string}
|
28
28
|
- {name: user, type: string}
|
29
29
|
- {name: datetime, type: timestamp, format: '%d/%b/%Y:%H:%M:%S %z'}
|
@@ -33,9 +33,9 @@ in:
|
|
33
33
|
- {name: status, type: long}
|
34
34
|
- {name: size, type: long}
|
35
35
|
- {name: referer, type: string}
|
36
|
-
- {name: user_agent, type: string,
|
37
|
-
- {name: in_byte, type: long,
|
38
|
-
- {name: out_byte, type: long,
|
36
|
+
- {name: user_agent, type: string, regex_name: userAgent}
|
37
|
+
- {name: in_byte, type: long, regex_name: inByte}
|
38
|
+
- {name: out_byte, type: long, regex_name: outByte}
|
39
39
|
```
|
40
40
|
|
41
41
|
### Guess
|
data/build.gradle
CHANGED
data/lib/embulk/guess/regex.rb
CHANGED
@@ -26,12 +26,12 @@ module Embulk
|
|
26
26
|
|
27
27
|
def apache_x_forwarded_for
|
28
28
|
RegexApacheLogGuesser.new
|
29
|
-
.ip_or_minus(:x_forwarded_for,
|
29
|
+
.ip_or_minus(:x_forwarded_for, regex_name: 'forwardedFor')
|
30
30
|
end
|
31
31
|
|
32
32
|
def apache_common(config, sample_lines)
|
33
33
|
RegexApacheLogGuesser.new
|
34
|
-
.ip(:remote_host,
|
34
|
+
.ip(:remote_host, regex_name: 'remoteHost').token(:identity).token(:user)
|
35
35
|
.kakko(:datetime, format: '%d/%b/%Y:%H:%M:%S %z', type: 'timestamp')
|
36
36
|
.method_path_protocol
|
37
37
|
.integer(:status).integer_or_minus(:size)
|
@@ -39,12 +39,12 @@ module Embulk
|
|
39
39
|
|
40
40
|
def apache_combined(config, sample_lines)
|
41
41
|
apache_common(config, sample_lines)
|
42
|
-
.string(:referer).string(:user_agent,
|
42
|
+
.string(:referer).string(:user_agent, regex_name: 'userAgent')
|
43
43
|
end
|
44
44
|
|
45
45
|
def apache_combinedio(config, sample_lines)
|
46
46
|
apache_combined(config, sample_lines)
|
47
|
-
.integer(:in_byte,
|
47
|
+
.integer(:in_byte, regex_name: 'inByte').integer(:out_byte, regex_name: 'outByte')
|
48
48
|
end
|
49
49
|
end
|
50
50
|
|
@@ -82,49 +82,49 @@ module Embulk
|
|
82
82
|
end
|
83
83
|
|
84
84
|
def ip(name, opts={})
|
85
|
-
@patterns << "(?<#{opts[:
|
85
|
+
@patterns << "(?<#{opts[:regex_name] || name}>[.:0-9]+)"
|
86
86
|
@columns << {:name => name, :type => 'string'}.merge(opts)
|
87
87
|
self
|
88
88
|
end
|
89
89
|
|
90
90
|
def ip_or_minus(name, opts={})
|
91
|
-
@patterns << "(?<#{opts[:
|
91
|
+
@patterns << "(?<#{opts[:regex_name] || name}>[.:0-9]+|-)"
|
92
92
|
@columns << {:name => name, :type => 'string'}.merge(opts)
|
93
93
|
self
|
94
94
|
end
|
95
95
|
|
96
96
|
def token(name, opts={})
|
97
|
-
@patterns << "(?<#{opts[:
|
97
|
+
@patterns << "(?<#{opts[:regex_name] || name}>\\S+)"
|
98
98
|
@columns << {:name => name, :type => 'string'}.merge(opts)
|
99
99
|
self
|
100
100
|
end
|
101
101
|
|
102
102
|
def string(name, opts={})
|
103
|
-
@patterns << "\"(?<#{opts[:
|
103
|
+
@patterns << "\"(?<#{opts[:regex_name] || name}>[^\"]*)\""
|
104
104
|
@columns << {:name => name, :type => 'string'}.merge(opts)
|
105
105
|
self
|
106
106
|
end
|
107
107
|
|
108
108
|
def string_or_minus(name, opts={})
|
109
|
-
@patterns << "\"(?<#{opts[:
|
109
|
+
@patterns << "\"(?<#{opts[:regex_name] || name}>[^\"]*|-)\""
|
110
110
|
@columns << {:name => name, :type => 'string'}.merge(opts)
|
111
111
|
self
|
112
112
|
end
|
113
113
|
|
114
114
|
def integer(name, opts={})
|
115
|
-
@patterns << "(?<#{opts[:
|
115
|
+
@patterns << "(?<#{opts[:regex_name] || name}>[0-9]+)"
|
116
116
|
@columns << {:name => name, :type => 'long'}.merge(opts)
|
117
117
|
self
|
118
118
|
end
|
119
119
|
|
120
120
|
def integer_or_minus(name, opts={})
|
121
|
-
@patterns << "(?<#{opts[:
|
121
|
+
@patterns << "(?<#{opts[:regex_name] || name}>[0-9]+|-)"
|
122
122
|
@columns << {:name => name, :type => 'long'}.merge(opts)
|
123
123
|
self
|
124
124
|
end
|
125
125
|
|
126
126
|
def kakko(name, opts={})
|
127
|
-
@patterns << "\\[(?<#{opts[:
|
127
|
+
@patterns << "\\[(?<#{opts[:regex_name] || name}>[^\\]]*)\\]"
|
128
128
|
@columns << {:name => name, :type => 'string'}.merge(opts)
|
129
129
|
self
|
130
130
|
end
|
data/sample/apache_1/config.yml
CHANGED
@@ -7,7 +7,7 @@ in:
|
|
7
7
|
type: regex
|
8
8
|
regex: ^(?<remoteHost>[.:0-9]+) (?<identity>\S+) (?<user>\S+) \[(?<datetime>[^\]]*)\] "((?<method>\S+) (?<path>\S+) (?<protocol>HTTP/\d+\.\d+)|-)" (?<status>[0-9]+) (?<size>[0-9]+|-) "(?<referer>[^"]*)" "(?<userAgent>[^"]*)" (?<inByte>[0-9]+) (?<outByte>[0-9]+)$
|
9
9
|
columns:
|
10
|
-
- {name: remote_host, type: string,
|
10
|
+
- {name: remote_host, type: string, regex_name: remoteHost}
|
11
11
|
- {name: identity, type: string}
|
12
12
|
- {name: user, type: string}
|
13
13
|
- {name: datetime, type: timestamp, format: '%d/%b/%Y:%H:%M:%S %z'}
|
@@ -17,7 +17,7 @@ in:
|
|
17
17
|
- {name: status, type: long}
|
18
18
|
- {name: size, type: long}
|
19
19
|
- {name: referer, type: string}
|
20
|
-
- {name: user_agent, type: string,
|
21
|
-
- {name: in_byte, type: long,
|
22
|
-
- {name: out_byte, type: long,
|
20
|
+
- {name: user_agent, type: string, regex_name: userAgent}
|
21
|
+
- {name: in_byte, type: long, regex_name: inByte}
|
22
|
+
- {name: out_byte, type: long, regex_name: outByte}
|
23
23
|
out: {type: stdout}
|
data/sample/apache_2/config.yml
CHANGED
@@ -7,8 +7,8 @@ in:
|
|
7
7
|
type: regex
|
8
8
|
regex: ^(?<forwardedFor>[.:0-9]+|-) (?<remoteHost>[.:0-9]+) (?<identity>\S+) (?<user>\S+) \[(?<datetime>[^\]]*)\] "((?<method>\S+) (?<path>\S+) (?<protocol>HTTP/\d+\.\d+)|-)" (?<status>[0-9]+) (?<size>[0-9]+|-) "(?<referer>[^"]*)" "(?<userAgent>[^"]*)"$
|
9
9
|
columns:
|
10
|
-
- {name: x_forwarded_for, type: string,
|
11
|
-
- {name: remote_host, type: string,
|
10
|
+
- {name: x_forwarded_for, type: string, regex_name: forwardedFor}
|
11
|
+
- {name: remote_host, type: string, regex_name: remoteHost}
|
12
12
|
- {name: identity, type: string}
|
13
13
|
- {name: user, type: string}
|
14
14
|
- {name: datetime, type: timestamp, format: '%d/%b/%Y:%H:%M:%S %z'}
|
@@ -18,5 +18,5 @@ in:
|
|
18
18
|
- {name: status, type: long}
|
19
19
|
- {name: size, type: long}
|
20
20
|
- {name: referer, type: string}
|
21
|
-
- {name: user_agent, type: string,
|
21
|
+
- {name: user_agent, type: string, regex_name: userAgent}
|
22
22
|
out: {type: stdout}
|
@@ -70,7 +70,7 @@ public class RegexParserPlugin implements ParserPlugin {
|
|
70
70
|
// TODO: How to Log?
|
71
71
|
continue;
|
72
72
|
} else {
|
73
|
-
throw new
|
73
|
+
throw new DataException("Unmatched Line: " + line);
|
74
74
|
}
|
75
75
|
}
|
76
76
|
|
@@ -100,7 +100,7 @@ public class RegexParserPlugin implements ParserPlugin {
|
|
100
100
|
String name = c.getName();
|
101
101
|
Type type = c.getType();
|
102
102
|
Column column = c.toColumn(index);
|
103
|
-
String regexName = c.getOption().get(String.class, "
|
103
|
+
String regexName = c.getOption().get(String.class, "regex_name", name);
|
104
104
|
|
105
105
|
DefaultValueSetter defaultValue = new NullDefaultValueSetter();
|
106
106
|
DynamicColumnSetter setter;
|
metadata
CHANGED
@@ -1,57 +1,54 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-parser-regex
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ken Morishita
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
date: 2015-08-31 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
15
|
-
|
15
|
+
version_requirements: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - ~>
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '1.0'
|
20
|
-
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirement: !ruby/object:Gem::Requirement
|
23
21
|
requirements:
|
24
|
-
- -
|
22
|
+
- - ~>
|
25
23
|
- !ruby/object:Gem::Version
|
26
24
|
version: '1.0'
|
25
|
+
prerelease: false
|
26
|
+
type: :development
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
|
-
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- -
|
31
|
+
- - ~>
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '10.0'
|
34
|
-
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
34
|
+
requirement: !ruby/object:Gem::Requirement
|
37
35
|
requirements:
|
38
|
-
- -
|
36
|
+
- - ~>
|
39
37
|
- !ruby/object:Gem::Version
|
40
38
|
version: '10.0'
|
41
|
-
|
42
|
-
|
39
|
+
prerelease: false
|
40
|
+
type: :development
|
41
|
+
description: Parses lines using regular-expression in files read by other file input plugins.
|
43
42
|
email:
|
44
43
|
- mokemokechicken@gmail.com
|
45
44
|
executables: []
|
46
45
|
extensions: []
|
47
46
|
extra_rdoc_files: []
|
48
47
|
files:
|
49
|
-
-
|
48
|
+
- .gitignore
|
50
49
|
- LICENSE.txt
|
51
50
|
- README.md
|
52
51
|
- build.gradle
|
53
|
-
- classpath/embulk-parser-regex-0.1.0.jar
|
54
|
-
- embulk-parser-regex.iml
|
55
52
|
- gradle/wrapper/gradle-wrapper.jar
|
56
53
|
- gradle/wrapper/gradle-wrapper.properties
|
57
54
|
- gradlew
|
@@ -68,28 +65,29 @@ files:
|
|
68
65
|
- sample/simple/data_simple_1.txt
|
69
66
|
- src/main/java/org/embulk/parser/regex/RegexParserPlugin.java
|
70
67
|
- src/test/java/org/embulk/parser/regex/TestRegexParserPlugin.java
|
68
|
+
- classpath/embulk-parser-regex-0.2.0.jar
|
71
69
|
homepage: https://github.com/mokemokechicken/embulk-parser-regex
|
72
70
|
licenses:
|
73
71
|
- MIT
|
74
72
|
metadata: {}
|
75
|
-
post_install_message:
|
73
|
+
post_install_message:
|
76
74
|
rdoc_options: []
|
77
75
|
require_paths:
|
78
76
|
- lib
|
79
77
|
required_ruby_version: !ruby/object:Gem::Requirement
|
80
78
|
requirements:
|
81
|
-
- -
|
79
|
+
- - '>='
|
82
80
|
- !ruby/object:Gem::Version
|
83
81
|
version: '0'
|
84
82
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
85
83
|
requirements:
|
86
|
-
- -
|
84
|
+
- - '>='
|
87
85
|
- !ruby/object:Gem::Version
|
88
86
|
version: '0'
|
89
87
|
requirements: []
|
90
|
-
rubyforge_project:
|
91
|
-
rubygems_version: 2.
|
92
|
-
signing_key:
|
88
|
+
rubyforge_project:
|
89
|
+
rubygems_version: 2.1.9
|
90
|
+
signing_key:
|
93
91
|
specification_version: 4
|
94
92
|
summary: Regex parser plugin for Embulk
|
95
93
|
test_files: []
|
data/embulk-parser-regex.iml
DELETED
@@ -1,49 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
-
<module external.linked.project.id="embulk-parser-regex" external.linked.project.path="$MODULE_DIR$" external.root.project.path="$MODULE_DIR$" external.system.id="GRADLE" external.system.module.group="" external.system.module.version="0.1.0" type="JAVA_MODULE" version="4">
|
3
|
-
<component name="NewModuleRootManager" inherit-compiler-output="false">
|
4
|
-
<output url="file://$MODULE_DIR$/build/classes/main" />
|
5
|
-
<output-test url="file://$MODULE_DIR$/build/classes/test" />
|
6
|
-
<exclude-output />
|
7
|
-
<content url="file://$MODULE_DIR$">
|
8
|
-
<sourceFolder url="file://$MODULE_DIR$/src/main/java" isTestSource="false" />
|
9
|
-
<sourceFolder url="file://$MODULE_DIR$/src/test/java" isTestSource="true" />
|
10
|
-
<sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" />
|
11
|
-
<sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" />
|
12
|
-
<excludeFolder url="file://$MODULE_DIR$/.gradle" />
|
13
|
-
<excludeFolder url="file://$MODULE_DIR$/build" />
|
14
|
-
</content>
|
15
|
-
<orderEntry type="inheritedJdk" />
|
16
|
-
<orderEntry type="sourceFolder" forTests="false" />
|
17
|
-
<orderEntry type="library" name="Gradle: org.embulk:embulk-core:0.7.4" level="project" />
|
18
|
-
<orderEntry type="library" name="Gradle: com.google.guava:guava:18.0" level="project" />
|
19
|
-
<orderEntry type="library" name="Gradle: com.google.inject:guice:4.0" level="project" />
|
20
|
-
<orderEntry type="library" name="Gradle: com.google.inject.extensions:guice-multibindings:4.0" level="project" />
|
21
|
-
<orderEntry type="library" name="Gradle: javax.inject:javax.inject:1" level="project" />
|
22
|
-
<orderEntry type="library" name="Gradle: com.fasterxml.jackson.core:jackson-annotations:2.5.3" level="project" />
|
23
|
-
<orderEntry type="library" name="Gradle: com.fasterxml.jackson.core:jackson-core:2.5.3" level="project" />
|
24
|
-
<orderEntry type="library" name="Gradle: com.fasterxml.jackson.core:jackson-databind:2.5.3" level="project" />
|
25
|
-
<orderEntry type="library" name="Gradle: com.fasterxml.jackson.datatype:jackson-datatype-guava:2.5.3" level="project" />
|
26
|
-
<orderEntry type="library" name="Gradle: com.fasterxml.jackson.datatype:jackson-datatype-joda:2.5.3" level="project" />
|
27
|
-
<orderEntry type="library" name="Gradle: com.fasterxml.jackson.module:jackson-module-guice:2.5.3" level="project" />
|
28
|
-
<orderEntry type="library" name="Gradle: ch.qos.logback:logback-classic:1.1.3" level="project" />
|
29
|
-
<orderEntry type="library" name="Gradle: org.slf4j:slf4j-api:1.7.12" level="project" />
|
30
|
-
<orderEntry type="library" name="Gradle: org.jruby:jruby-complete:9.0.0.0" level="project" />
|
31
|
-
<orderEntry type="library" name="Gradle: com.google.code.findbugs:annotations:3.0.0" level="project" />
|
32
|
-
<orderEntry type="library" name="Gradle: org.yaml:snakeyaml:1.14" level="project" />
|
33
|
-
<orderEntry type="library" name="Gradle: javax.validation:validation-api:1.1.0.Final" level="project" />
|
34
|
-
<orderEntry type="library" name="Gradle: org.apache.bval:bval-jsr303:0.5" level="project" />
|
35
|
-
<orderEntry type="library" name="Gradle: io.airlift:slice:0.9" level="project" />
|
36
|
-
<orderEntry type="library" name="Gradle: joda-time:joda-time:2.8.1" level="project" />
|
37
|
-
<orderEntry type="library" name="Gradle: io.netty:netty-buffer:5.0.0.Alpha1" level="project" />
|
38
|
-
<orderEntry type="library" name="Gradle: org.fusesource.jansi:jansi:1.11" level="project" />
|
39
|
-
<orderEntry type="library" name="Gradle: com.ibm.icu:icu4j:54.1.1" level="project" />
|
40
|
-
<orderEntry type="library" name="Gradle: aopalliance:aopalliance:1.0" level="project" />
|
41
|
-
<orderEntry type="library" name="Gradle: ch.qos.logback:logback-core:1.1.3" level="project" />
|
42
|
-
<orderEntry type="library" name="Gradle: org.apache.bval:bval-core:0.5" level="project" />
|
43
|
-
<orderEntry type="library" name="Gradle: org.apache.commons:commons-lang3:3.1" level="project" />
|
44
|
-
<orderEntry type="library" name="Gradle: io.netty:netty-common:5.0.0.Alpha1" level="project" />
|
45
|
-
<orderEntry type="library" name="Gradle: commons-beanutils:commons-beanutils-core:1.8.3" level="project" />
|
46
|
-
<orderEntry type="library" scope="TEST" name="Gradle: junit:junit:4.12" level="project" />
|
47
|
-
<orderEntry type="library" scope="TEST" name="Gradle: org.hamcrest:hamcrest-core:1.3" level="project" />
|
48
|
-
</component>
|
49
|
-
</module>
|