embulk-filter-gsub 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +20 -0
- data/build.gradle +2 -1
- data/gradle.properties +1 -1
- data/src/main/kotlin/org/embulk/filter/gsub/LowerCaseReplacerFactory.kt +1 -4
- data/src/main/kotlin/org/embulk/filter/gsub/RegexFactory.kt +32 -0
- data/src/main/kotlin/org/embulk/filter/gsub/RegexOptions.kt +23 -0
- data/src/main/kotlin/org/embulk/filter/gsub/RegexReplacerFactory.kt +1 -4
- data/src/main/kotlin/org/embulk/filter/gsub/SubstitutionRule.kt +4 -0
- data/src/main/kotlin/org/embulk/filter/gsub/UpperCaseReplacerFactory.kt +1 -4
- data/src/test/kotlin/org/embulk/filter/gsub/TestGsubFilterPlugin.kt +48 -0
- data/src/test/kotlin/org/embulk/filter/gsub/replacer/RegexReplacerTest.kt +118 -3
- metadata +7 -6
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/RegexFactory.kt +0 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f2225445bc1080f16f10ccf3b514f4103f70cf39
|
4
|
+
data.tar.gz: 6cb469f097ee14aa27ced7847a9dc787eb0dc211
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7e33ff204e96d633f242a975966b547df3353230a40c292131d2eb609e45494a63b2e3213c7874c2c1f7a389ab8adf26e2a88ca2d6de65e435cbde98d2c3830a
|
7
|
+
data.tar.gz: f9818862b57c4f7609ff3f153de35488de1098ea00b6a2a569a9acbdcf5f966ca89c8f7b6ecf6cfa0696b60e69c82e694b38879ab2800d323a44865c3fa84ee9
|
data/README.md
CHANGED
@@ -133,6 +133,26 @@ target_columns:
|
|
133
133
|
to: "$1 = [$2]"
|
134
134
|
```
|
135
135
|
|
136
|
+
### Regular expression options
|
137
|
+
|
138
|
+
You can specify some regular expression options.
|
139
|
+
|
140
|
+
```yaml
|
141
|
+
target_columns:
|
142
|
+
foo:
|
143
|
+
- type: regexp_replace
|
144
|
+
pattern: 'foo'
|
145
|
+
to: "***"
|
146
|
+
regexp_options:
|
147
|
+
ignore_case: true
|
148
|
+
```
|
149
|
+
|
150
|
+
Supported options are:
|
151
|
+
|
152
|
+
* **ignore_case** (boolean, default: false)
|
153
|
+
* **multiline** (boolean, default: true)
|
154
|
+
* **dot_matches_all** (boolean, default: false)
|
155
|
+
* **enable_comments** (boolean, default: false)
|
136
156
|
|
137
157
|
## Build
|
138
158
|
|
data/build.gradle
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
buildscript {
|
2
|
-
ext.kotlin_version = '1.2.
|
2
|
+
ext.kotlin_version = '1.2.21'
|
3
3
|
repositories {
|
4
4
|
mavenCentral()
|
5
5
|
jcenter()
|
@@ -18,6 +18,7 @@ embulk {
|
|
18
18
|
version = "0.8.38"
|
19
19
|
category = "filter"
|
20
20
|
name = "gsub"
|
21
|
+
description = "Embulk filter plugin to convert text column values with regular expressions"
|
21
22
|
authors = ["Sawada Tadashi"]
|
22
23
|
email = "cesare@mayverse.jp"
|
23
24
|
homepage = "https://github.com/cesare/embulk-filter-gsub"
|
data/gradle.properties
CHANGED
@@ -1 +1 @@
|
|
1
|
-
version=0.1.0
|
1
|
+
version=0.2.0
|
@@ -1,7 +1,6 @@
|
|
1
1
|
package org.embulk.filter.gsub
|
2
2
|
|
3
3
|
import org.embulk.filter.gsub.replacer.LowerCaseReplacer
|
4
|
-
import org.embulk.filter.gsub.replacer.RegexFactory
|
5
4
|
import org.embulk.filter.gsub.replacer.RegexOptionConfig
|
6
5
|
import org.embulk.filter.gsub.replacer.TextReplacer
|
7
6
|
|
@@ -9,10 +8,8 @@ class LowerCaseReplacerFactory : TextReplacerFactory() {
|
|
9
8
|
override fun create(rule: SubstitutionRule): TextReplacer {
|
10
9
|
val pattern = rule.pattern.orNull()
|
11
10
|
if (pattern != null) {
|
12
|
-
// TODO set regex options
|
13
|
-
val regexOptionConfig = RegexOptionConfig()
|
14
11
|
val factory = RegexFactory()
|
15
|
-
val regex = factory.create(pattern, regexOptionConfig)
|
12
|
+
val regex = factory.create(pattern, rule.regexOptions)
|
16
13
|
|
17
14
|
return LowerCaseReplacer(regex)
|
18
15
|
}
|
@@ -0,0 +1,32 @@
|
|
1
|
+
package org.embulk.filter.gsub
|
2
|
+
|
3
|
+
import org.embulk.filter.gsub.RegexOptions
|
4
|
+
|
5
|
+
class RegexFactory {
|
6
|
+
fun create(patternString: String, regexOptions: RegexOptions): Regex {
|
7
|
+
val options = buildOptions(regexOptions)
|
8
|
+
return Regex(patternString, options)
|
9
|
+
}
|
10
|
+
|
11
|
+
private fun buildOptions(regexOptions: RegexOptions): Set<RegexOption> {
|
12
|
+
val options = HashSet<RegexOption>()
|
13
|
+
|
14
|
+
if (regexOptions.ignoreCase) {
|
15
|
+
options.add(RegexOption.IGNORE_CASE)
|
16
|
+
}
|
17
|
+
|
18
|
+
if (regexOptions.multiline) {
|
19
|
+
options.add(RegexOption.MULTILINE)
|
20
|
+
}
|
21
|
+
|
22
|
+
if (regexOptions.dotMatchesAll) {
|
23
|
+
options.add(RegexOption.DOT_MATCHES_ALL)
|
24
|
+
}
|
25
|
+
|
26
|
+
if (regexOptions.enableComments) {
|
27
|
+
options.add(RegexOption.COMMENTS)
|
28
|
+
}
|
29
|
+
|
30
|
+
return options
|
31
|
+
}
|
32
|
+
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
package org.embulk.filter.gsub
|
2
|
+
|
3
|
+
import org.embulk.config.Config
|
4
|
+
import org.embulk.config.ConfigDefault
|
5
|
+
import org.embulk.config.Task
|
6
|
+
|
7
|
+
interface RegexOptions : Task {
|
8
|
+
@get:Config("ignore_case")
|
9
|
+
@get:ConfigDefault("false")
|
10
|
+
val ignoreCase: Boolean
|
11
|
+
|
12
|
+
@get:Config("multiline")
|
13
|
+
@get:ConfigDefault("true")
|
14
|
+
val multiline: Boolean
|
15
|
+
|
16
|
+
@get:Config("dot_matches_all")
|
17
|
+
@get:ConfigDefault("false")
|
18
|
+
val dotMatchesAll: Boolean
|
19
|
+
|
20
|
+
@get:Config("enable_comments")
|
21
|
+
@get:ConfigDefault("false")
|
22
|
+
val enableComments: Boolean
|
23
|
+
}
|
@@ -1,6 +1,5 @@
|
|
1
1
|
package org.embulk.filter.gsub
|
2
2
|
|
3
|
-
import org.embulk.filter.gsub.replacer.RegexFactory
|
4
3
|
import org.embulk.filter.gsub.replacer.RegexOptionConfig
|
5
4
|
import org.embulk.filter.gsub.replacer.RegexReplacer
|
6
5
|
import org.embulk.filter.gsub.replacer.TextReplacer
|
@@ -10,10 +9,8 @@ class RegexReplacerFactory : TextReplacerFactory() {
|
|
10
9
|
val pattern = rule.pattern.get()
|
11
10
|
val to = rule.to.get()
|
12
11
|
|
13
|
-
val regexOptionConfig = RegexOptionConfig()
|
14
|
-
|
15
12
|
val factory = RegexFactory()
|
16
|
-
val regex = factory.create(pattern, regexOptionConfig)
|
13
|
+
val regex = factory.create(pattern, rule.regexOptions)
|
17
14
|
|
18
15
|
return RegexReplacer(regex, to)
|
19
16
|
}
|
@@ -1,6 +1,5 @@
|
|
1
1
|
package org.embulk.filter.gsub
|
2
2
|
|
3
|
-
import org.embulk.filter.gsub.replacer.RegexFactory
|
4
3
|
import org.embulk.filter.gsub.replacer.RegexOptionConfig
|
5
4
|
import org.embulk.filter.gsub.replacer.TextReplacer
|
6
5
|
import org.embulk.filter.gsub.replacer.UpperCaseReplacer
|
@@ -9,10 +8,8 @@ class UpperCaseReplacerFactory : TextReplacerFactory() {
|
|
9
8
|
override fun create(rule: SubstitutionRule): TextReplacer {
|
10
9
|
val pattern = rule.pattern.orNull()
|
11
10
|
if (pattern != null) {
|
12
|
-
// TODO set regex options
|
13
|
-
val regexOptionConfig = RegexOptionConfig()
|
14
11
|
val factory = RegexFactory()
|
15
|
-
val regex = factory.create(pattern, regexOptionConfig)
|
12
|
+
val regex = factory.create(pattern, rule.regexOptions)
|
16
13
|
|
17
14
|
return UpperCaseReplacer(regex)
|
18
15
|
}
|
@@ -57,6 +57,54 @@ class TestGsubFilterPlugin {
|
|
57
57
|
Assert.assertEquals("\\1 [\\2]", barRule2.to.get())
|
58
58
|
}
|
59
59
|
|
60
|
+
@Test
|
61
|
+
fun testDefaultRegexOptions() {
|
62
|
+
val configYaml = """
|
63
|
+
|type: gsub
|
64
|
+
|target_columns:
|
65
|
+
| foo:
|
66
|
+
| - type: regexp_replace
|
67
|
+
| pattern: "test"
|
68
|
+
""".trimMargin()
|
69
|
+
|
70
|
+
val config = getConfigFromYaml(configYaml)
|
71
|
+
val task = config.loadConfig(GsubFilterPlugin.PluginTask::class.java)
|
72
|
+
val fooRules = task.targetColumns["foo"]!!
|
73
|
+
val fooRule = fooRules[0]
|
74
|
+
val regexOptions = fooRule.regexOptions
|
75
|
+
Assert.assertFalse(regexOptions.ignoreCase)
|
76
|
+
Assert.assertTrue(regexOptions.multiline)
|
77
|
+
Assert.assertFalse(regexOptions.dotMatchesAll)
|
78
|
+
Assert.assertFalse(regexOptions.enableComments)
|
79
|
+
}
|
80
|
+
|
81
|
+
@Test
|
82
|
+
fun testRegexOptions() {
|
83
|
+
val configYaml = """
|
84
|
+
|type: gsub
|
85
|
+
|target_columns:
|
86
|
+
| foo:
|
87
|
+
| - type: regexp_replace
|
88
|
+
| pattern: "test"
|
89
|
+
| regexp_options:
|
90
|
+
| ignore_case: true
|
91
|
+
| multiline: true
|
92
|
+
| dot_matches_all: true
|
93
|
+
| enable_comments: true
|
94
|
+
""".trimMargin()
|
95
|
+
|
96
|
+
val config = getConfigFromYaml(configYaml)
|
97
|
+
val task = config.loadConfig(GsubFilterPlugin.PluginTask::class.java)
|
98
|
+
val fooRules = task.targetColumns["foo"]!!
|
99
|
+
val fooRule = fooRules[0]
|
100
|
+
val regexOptions = fooRule.regexOptions
|
101
|
+
|
102
|
+
Assert.assertTrue(regexOptions.ignoreCase)
|
103
|
+
Assert.assertTrue(regexOptions.multiline)
|
104
|
+
Assert.assertTrue(regexOptions.dotMatchesAll)
|
105
|
+
Assert.assertTrue(regexOptions.enableComments)
|
106
|
+
}
|
107
|
+
|
60
108
|
@Test
|
61
109
|
fun testEmptyFilter() {
|
62
110
|
val configYaml = """
|
@@ -1,19 +1,134 @@
|
|
1
1
|
package org.embulk.filter.gsub.replacer
|
2
2
|
|
3
|
+
import org.embulk.config.TaskSource
|
4
|
+
import org.embulk.filter.gsub.RegexFactory
|
5
|
+
import org.embulk.filter.gsub.RegexOptions
|
3
6
|
import org.junit.Assert
|
4
7
|
import org.junit.Test
|
5
8
|
|
6
9
|
class RegexReplacerTest {
|
7
10
|
@Test
|
8
11
|
fun testExecute() {
|
9
|
-
val optionConfig = RegexOptionConfig()
|
10
|
-
optionConfig.ignoreCase = true
|
12
|
+
val regexOptions = createRegexpOption()
|
11
13
|
|
12
14
|
val factory = RegexFactory()
|
13
|
-
val pattern = factory.create("(\\w*):\\s*(.*)", optionConfig)
|
15
|
+
val pattern = factory.create("(\\w*):\\s*(.*)", regexOptions)
|
14
16
|
|
15
17
|
val replacer = RegexReplacer(pattern, "$1 [$2]")
|
16
18
|
val result = replacer.execute("test: foo bar baz")
|
17
19
|
Assert.assertEquals("test [foo bar baz]", result)
|
18
20
|
}
|
21
|
+
|
22
|
+
@Test
|
23
|
+
fun testExecuteWithoutIgnoreCaseOption() {
|
24
|
+
val regexOptions = createRegexpOption(ignoreCase = false)
|
25
|
+
|
26
|
+
val factory = RegexFactory()
|
27
|
+
val pattern = factory.create("foo", regexOptions)
|
28
|
+
|
29
|
+
val replacer = RegexReplacer(pattern, "*test-foo*")
|
30
|
+
|
31
|
+
Assert.assertEquals("*test-foo* bar baz", replacer.execute("foo bar baz"))
|
32
|
+
Assert.assertEquals("Foo bar baz", replacer.execute("Foo bar baz"))
|
33
|
+
Assert.assertEquals("FOO bar baz", replacer.execute("FOO bar baz"))
|
34
|
+
}
|
35
|
+
|
36
|
+
@Test
|
37
|
+
fun testExecuteWithIgnoreCaseOption() {
|
38
|
+
val regexOptions = createRegexpOption(ignoreCase = true)
|
39
|
+
val factory = RegexFactory()
|
40
|
+
val pattern = factory.create("foo", regexOptions)
|
41
|
+
|
42
|
+
val replacer = RegexReplacer(pattern, "*test-foo*")
|
43
|
+
|
44
|
+
Assert.assertEquals("*test-foo* bar baz", replacer.execute("foo bar baz"))
|
45
|
+
Assert.assertEquals("*test-foo* bar baz", replacer.execute("Foo bar baz"))
|
46
|
+
Assert.assertEquals("*test-foo* bar baz", replacer.execute("FOO bar baz"))
|
47
|
+
}
|
48
|
+
|
49
|
+
@Test
|
50
|
+
fun testExecuteWithoutMultilineOption() {
|
51
|
+
val regexOptions = createRegexpOption(multiline = false)
|
52
|
+
val factory = RegexFactory()
|
53
|
+
val pattern = factory.create("^bar", regexOptions)
|
54
|
+
|
55
|
+
val replacer = RegexReplacer(pattern, "*BAR*")
|
56
|
+
|
57
|
+
Assert.assertEquals("foo\nbar\nbaz", replacer.execute("foo\nbar\nbaz"))
|
58
|
+
}
|
59
|
+
|
60
|
+
@Test
|
61
|
+
fun testExecuteWithMultilineOption() {
|
62
|
+
val regexOptions = createRegexpOption(multiline = true)
|
63
|
+
val factory = RegexFactory()
|
64
|
+
val pattern = factory.create("^bar", regexOptions)
|
65
|
+
|
66
|
+
val replacer = RegexReplacer(pattern, "*BAR*")
|
67
|
+
|
68
|
+
Assert.assertEquals("foo\n*BAR*\nbaz", replacer.execute("foo\nbar\nbaz"))
|
69
|
+
}
|
70
|
+
|
71
|
+
@Test
|
72
|
+
fun testExecuteWithoutDotMatchesAllOption() {
|
73
|
+
val regexOptions = createRegexpOption(dotMatchesAll = false)
|
74
|
+
val factory = RegexFactory()
|
75
|
+
val pattern = factory.create("foo.bar.baz", regexOptions)
|
76
|
+
|
77
|
+
val replacer = RegexReplacer(pattern, "[foo-bar-baz]")
|
78
|
+
|
79
|
+
Assert.assertEquals("[foo-bar-baz]", replacer.execute("foo/bar/baz"))
|
80
|
+
Assert.assertEquals("foo\nbar/baz", replacer.execute("foo\nbar/baz"))
|
81
|
+
}
|
82
|
+
|
83
|
+
@Test
|
84
|
+
fun testExecuteWithDotMatchesAllOption() {
|
85
|
+
val regexOptions = createRegexpOption(dotMatchesAll = true)
|
86
|
+
val factory = RegexFactory()
|
87
|
+
val pattern = factory.create("foo.bar.baz", regexOptions)
|
88
|
+
|
89
|
+
val replacer = RegexReplacer(pattern, "[foo-bar-baz]")
|
90
|
+
|
91
|
+
Assert.assertEquals("[foo-bar-baz]", replacer.execute("foo/bar/baz"))
|
92
|
+
Assert.assertEquals("[foo-bar-baz]", replacer.execute("foo\nbar/baz"))
|
93
|
+
}
|
94
|
+
|
95
|
+
@Test
|
96
|
+
fun testExecuteWithEnableCommentsOption() {
|
97
|
+
val regexOptions = createRegexpOption(enableComments = true)
|
98
|
+
val factory = RegexFactory()
|
99
|
+
|
100
|
+
val patternString = """
|
101
|
+
|(ba\w) # matches bar and baz
|
102
|
+
""".trimMargin()
|
103
|
+
val pattern = factory.create(patternString, regexOptions)
|
104
|
+
|
105
|
+
val replacer = RegexReplacer(pattern, "*$1*")
|
106
|
+
|
107
|
+
Assert.assertEquals("foo *bar* *baz*", replacer.execute("foo bar baz"))
|
108
|
+
}
|
109
|
+
|
110
|
+
private fun createRegexpOption(
|
111
|
+
ignoreCase: Boolean = false,
|
112
|
+
multiline: Boolean = true,
|
113
|
+
dotMatchesAll: Boolean = false,
|
114
|
+
enableComments: Boolean = false
|
115
|
+
): RegexOptions {
|
116
|
+
return object: RegexOptions {
|
117
|
+
override val ignoreCase: Boolean
|
118
|
+
get() = ignoreCase
|
119
|
+
override val multiline: Boolean
|
120
|
+
get() = multiline
|
121
|
+
override val dotMatchesAll: Boolean
|
122
|
+
get() = dotMatchesAll
|
123
|
+
override val enableComments: Boolean
|
124
|
+
get() = enableComments
|
125
|
+
|
126
|
+
override fun validate() {
|
127
|
+
}
|
128
|
+
|
129
|
+
override fun dump(): TaskSource {
|
130
|
+
throw NotImplementedError()
|
131
|
+
}
|
132
|
+
}
|
133
|
+
}
|
19
134
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-gsub
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sawada Tadashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-02-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -38,7 +38,7 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
|
-
description:
|
41
|
+
description: Embulk filter plugin to convert text column values with regular expressions
|
42
42
|
email:
|
43
43
|
- cesare@mayverse.jp
|
44
44
|
executables: []
|
@@ -60,13 +60,14 @@ files:
|
|
60
60
|
- src/main/kotlin/org/embulk/filter/gsub/ColumnVisitorImpl.kt
|
61
61
|
- src/main/kotlin/org/embulk/filter/gsub/GsubFilterPlugin.kt
|
62
62
|
- src/main/kotlin/org/embulk/filter/gsub/LowerCaseReplacerFactory.kt
|
63
|
+
- src/main/kotlin/org/embulk/filter/gsub/RegexFactory.kt
|
64
|
+
- src/main/kotlin/org/embulk/filter/gsub/RegexOptions.kt
|
63
65
|
- src/main/kotlin/org/embulk/filter/gsub/RegexReplacerFactory.kt
|
64
66
|
- src/main/kotlin/org/embulk/filter/gsub/SubstitutionRule.kt
|
65
67
|
- src/main/kotlin/org/embulk/filter/gsub/TextReplacerFactory.kt
|
66
68
|
- src/main/kotlin/org/embulk/filter/gsub/UpperCaseReplacerFactory.kt
|
67
69
|
- src/main/kotlin/org/embulk/filter/gsub/replacer/CombinedReplacer.kt
|
68
70
|
- src/main/kotlin/org/embulk/filter/gsub/replacer/LowerCaseReplacer.kt
|
69
|
-
- src/main/kotlin/org/embulk/filter/gsub/replacer/RegexFactory.kt
|
70
71
|
- src/main/kotlin/org/embulk/filter/gsub/replacer/RegexOptionConfig.kt
|
71
72
|
- src/main/kotlin/org/embulk/filter/gsub/replacer/RegexReplacer.kt
|
72
73
|
- src/main/kotlin/org/embulk/filter/gsub/replacer/TextReplacer.kt
|
@@ -76,8 +77,8 @@ files:
|
|
76
77
|
- src/test/kotlin/org/embulk/filter/gsub/replacer/LowerCaseReplacerTest.kt
|
77
78
|
- src/test/kotlin/org/embulk/filter/gsub/replacer/RegexReplacerTest.kt
|
78
79
|
- classpath/annotations-13.0.jar
|
79
|
-
- classpath/embulk-filter-gsub-0.1.0.jar
|
80
|
-
- classpath/kotlin-stdlib-1.2.
|
80
|
+
- classpath/embulk-filter-gsub-0.2.0.jar
|
81
|
+
- classpath/kotlin-stdlib-1.2.21.jar
|
81
82
|
homepage: https://github.com/cesare/embulk-filter-gsub
|
82
83
|
licenses:
|
83
84
|
- MIT
|
@@ -1,22 +0,0 @@
|
|
1
|
-
package org.embulk.filter.gsub.replacer
|
2
|
-
|
3
|
-
class RegexFactory {
|
4
|
-
fun create(patternString: String, regexOptionConfig: RegexOptionConfig): Regex {
|
5
|
-
val options = buildOptions(regexOptionConfig)
|
6
|
-
return Regex(patternString, options)
|
7
|
-
}
|
8
|
-
|
9
|
-
private fun buildOptions(optionConfig: RegexOptionConfig): Set<RegexOption> {
|
10
|
-
val options = HashSet<RegexOption>()
|
11
|
-
|
12
|
-
if (optionConfig.ignoreCase) {
|
13
|
-
options.add(RegexOption.IGNORE_CASE)
|
14
|
-
}
|
15
|
-
|
16
|
-
if (optionConfig.multiline) {
|
17
|
-
options.add(RegexOption.MULTILINE)
|
18
|
-
}
|
19
|
-
|
20
|
-
return options
|
21
|
-
}
|
22
|
-
}
|