embulk-filter-gsub 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +20 -0
- data/build.gradle +2 -1
- data/gradle.properties +1 -1
- data/src/main/kotlin/org/embulk/filter/gsub/LowerCaseReplacerFactory.kt +1 -4
- data/src/main/kotlin/org/embulk/filter/gsub/RegexFactory.kt +32 -0
- data/src/main/kotlin/org/embulk/filter/gsub/RegexOptions.kt +23 -0
- data/src/main/kotlin/org/embulk/filter/gsub/RegexReplacerFactory.kt +1 -4
- data/src/main/kotlin/org/embulk/filter/gsub/SubstitutionRule.kt +4 -0
- data/src/main/kotlin/org/embulk/filter/gsub/UpperCaseReplacerFactory.kt +1 -4
- data/src/test/kotlin/org/embulk/filter/gsub/TestGsubFilterPlugin.kt +48 -0
- data/src/test/kotlin/org/embulk/filter/gsub/replacer/RegexReplacerTest.kt +118 -3
- metadata +7 -6
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/RegexFactory.kt +0 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f2225445bc1080f16f10ccf3b514f4103f70cf39
|
4
|
+
data.tar.gz: 6cb469f097ee14aa27ced7847a9dc787eb0dc211
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7e33ff204e96d633f242a975966b547df3353230a40c292131d2eb609e45494a63b2e3213c7874c2c1f7a389ab8adf26e2a88ca2d6de65e435cbde98d2c3830a
|
7
|
+
data.tar.gz: f9818862b57c4f7609ff3f153de35488de1098ea00b6a2a569a9acbdcf5f966ca89c8f7b6ecf6cfa0696b60e69c82e694b38879ab2800d323a44865c3fa84ee9
|
data/README.md
CHANGED
@@ -133,6 +133,26 @@ target_columns:
|
|
133
133
|
to: "$1 = [$2]"
|
134
134
|
```
|
135
135
|
|
136
|
+
### Regular expression options
|
137
|
+
|
138
|
+
You can specify some regular expression options.
|
139
|
+
|
140
|
+
```yaml
|
141
|
+
target_columns:
|
142
|
+
foo:
|
143
|
+
- type: regexp_replace
|
144
|
+
pattern: 'foo'
|
145
|
+
to: "***"
|
146
|
+
regexp_options:
|
147
|
+
ignore_case: true
|
148
|
+
```
|
149
|
+
|
150
|
+
Supported options are:
|
151
|
+
|
152
|
+
* **ignore_case** (boolean, default: false)
|
153
|
+
* **multiline** (boolean, default: true)
|
154
|
+
* **dot_matches_all** (boolean, default: false)
|
155
|
+
* **enable_comments** (boolean, default: false)
|
136
156
|
|
137
157
|
## Build
|
138
158
|
|
data/build.gradle
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
buildscript {
|
2
|
-
ext.kotlin_version = '1.2.
|
2
|
+
ext.kotlin_version = '1.2.21'
|
3
3
|
repositories {
|
4
4
|
mavenCentral()
|
5
5
|
jcenter()
|
@@ -18,6 +18,7 @@ embulk {
|
|
18
18
|
version = "0.8.38"
|
19
19
|
category = "filter"
|
20
20
|
name = "gsub"
|
21
|
+
description = "Embulk filter plugin to convert text column values with regular expressions"
|
21
22
|
authors = ["Sawada Tadashi"]
|
22
23
|
email = "cesare@mayverse.jp"
|
23
24
|
homepage = "https://github.com/cesare/embulk-filter-gsub"
|
data/gradle.properties
CHANGED
@@ -1 +1 @@
|
|
1
|
-
version=0.1.0
|
1
|
+
version=0.2.0
|
@@ -1,7 +1,6 @@
|
|
1
1
|
package org.embulk.filter.gsub
|
2
2
|
|
3
3
|
import org.embulk.filter.gsub.replacer.LowerCaseReplacer
|
4
|
-
import org.embulk.filter.gsub.replacer.RegexFactory
|
5
4
|
import org.embulk.filter.gsub.replacer.RegexOptionConfig
|
6
5
|
import org.embulk.filter.gsub.replacer.TextReplacer
|
7
6
|
|
@@ -9,10 +8,8 @@ class LowerCaseReplacerFactory : TextReplacerFactory() {
|
|
9
8
|
override fun create(rule: SubstitutionRule): TextReplacer {
|
10
9
|
val pattern = rule.pattern.orNull()
|
11
10
|
if (pattern != null) {
|
12
|
-
// TODO set regex options
|
13
|
-
val regexOptionConfig = RegexOptionConfig()
|
14
11
|
val factory = RegexFactory()
|
15
|
-
val regex = factory.create(pattern, regexOptionConfig)
|
12
|
+
val regex = factory.create(pattern, rule.regexOptions)
|
16
13
|
|
17
14
|
return LowerCaseReplacer(regex)
|
18
15
|
}
|
@@ -0,0 +1,32 @@
|
|
1
|
+
package org.embulk.filter.gsub
|
2
|
+
|
3
|
+
import org.embulk.filter.gsub.RegexOptions
|
4
|
+
|
5
|
+
class RegexFactory {
|
6
|
+
fun create(patternString: String, regexOptions: RegexOptions): Regex {
|
7
|
+
val options = buildOptions(regexOptions)
|
8
|
+
return Regex(patternString, options)
|
9
|
+
}
|
10
|
+
|
11
|
+
private fun buildOptions(regexOptions: RegexOptions): Set<RegexOption> {
|
12
|
+
val options = HashSet<RegexOption>()
|
13
|
+
|
14
|
+
if (regexOptions.ignoreCase) {
|
15
|
+
options.add(RegexOption.IGNORE_CASE)
|
16
|
+
}
|
17
|
+
|
18
|
+
if (regexOptions.multiline) {
|
19
|
+
options.add(RegexOption.MULTILINE)
|
20
|
+
}
|
21
|
+
|
22
|
+
if (regexOptions.dotMatchesAll) {
|
23
|
+
options.add(RegexOption.DOT_MATCHES_ALL)
|
24
|
+
}
|
25
|
+
|
26
|
+
if (regexOptions.enableComments) {
|
27
|
+
options.add(RegexOption.COMMENTS)
|
28
|
+
}
|
29
|
+
|
30
|
+
return options
|
31
|
+
}
|
32
|
+
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
package org.embulk.filter.gsub
|
2
|
+
|
3
|
+
import org.embulk.config.Config
|
4
|
+
import org.embulk.config.ConfigDefault
|
5
|
+
import org.embulk.config.Task
|
6
|
+
|
7
|
+
interface RegexOptions : Task {
|
8
|
+
@get:Config("ignore_case")
|
9
|
+
@get:ConfigDefault("false")
|
10
|
+
val ignoreCase: Boolean
|
11
|
+
|
12
|
+
@get:Config("multiline")
|
13
|
+
@get:ConfigDefault("true")
|
14
|
+
val multiline: Boolean
|
15
|
+
|
16
|
+
@get:Config("dot_matches_all")
|
17
|
+
@get:ConfigDefault("false")
|
18
|
+
val dotMatchesAll: Boolean
|
19
|
+
|
20
|
+
@get:Config("enable_comments")
|
21
|
+
@get:ConfigDefault("false")
|
22
|
+
val enableComments: Boolean
|
23
|
+
}
|
@@ -1,6 +1,5 @@
|
|
1
1
|
package org.embulk.filter.gsub
|
2
2
|
|
3
|
-
import org.embulk.filter.gsub.replacer.RegexFactory
|
4
3
|
import org.embulk.filter.gsub.replacer.RegexOptionConfig
|
5
4
|
import org.embulk.filter.gsub.replacer.RegexReplacer
|
6
5
|
import org.embulk.filter.gsub.replacer.TextReplacer
|
@@ -10,10 +9,8 @@ class RegexReplacerFactory : TextReplacerFactory() {
|
|
10
9
|
val pattern = rule.pattern.get()
|
11
10
|
val to = rule.to.get()
|
12
11
|
|
13
|
-
val regexOptionConfig = RegexOptionConfig()
|
14
|
-
|
15
12
|
val factory = RegexFactory()
|
16
|
-
val regex = factory.create(pattern, regexOptionConfig)
|
13
|
+
val regex = factory.create(pattern, rule.regexOptions)
|
17
14
|
|
18
15
|
return RegexReplacer(regex, to)
|
19
16
|
}
|
@@ -1,6 +1,5 @@
|
|
1
1
|
package org.embulk.filter.gsub
|
2
2
|
|
3
|
-
import org.embulk.filter.gsub.replacer.RegexFactory
|
4
3
|
import org.embulk.filter.gsub.replacer.RegexOptionConfig
|
5
4
|
import org.embulk.filter.gsub.replacer.TextReplacer
|
6
5
|
import org.embulk.filter.gsub.replacer.UpperCaseReplacer
|
@@ -9,10 +8,8 @@ class UpperCaseReplacerFactory : TextReplacerFactory() {
|
|
9
8
|
override fun create(rule: SubstitutionRule): TextReplacer {
|
10
9
|
val pattern = rule.pattern.orNull()
|
11
10
|
if (pattern != null) {
|
12
|
-
// TODO set regex options
|
13
|
-
val regexOptionConfig = RegexOptionConfig()
|
14
11
|
val factory = RegexFactory()
|
15
|
-
val regex = factory.create(pattern, regexOptionConfig)
|
12
|
+
val regex = factory.create(pattern, rule.regexOptions)
|
16
13
|
|
17
14
|
return UpperCaseReplacer(regex)
|
18
15
|
}
|
@@ -57,6 +57,54 @@ class TestGsubFilterPlugin {
|
|
57
57
|
Assert.assertEquals("\\1 [\\2]", barRule2.to.get())
|
58
58
|
}
|
59
59
|
|
60
|
+
@Test
|
61
|
+
fun testDefaultRegexOptions() {
|
62
|
+
val configYaml = """
|
63
|
+
|type: gsub
|
64
|
+
|target_columns:
|
65
|
+
| foo:
|
66
|
+
| - type: regexp_replace
|
67
|
+
| pattern: "test"
|
68
|
+
""".trimMargin()
|
69
|
+
|
70
|
+
val config = getConfigFromYaml(configYaml)
|
71
|
+
val task = config.loadConfig(GsubFilterPlugin.PluginTask::class.java)
|
72
|
+
val fooRules = task.targetColumns["foo"]!!
|
73
|
+
val fooRule = fooRules[0]
|
74
|
+
val regexOptions = fooRule.regexOptions
|
75
|
+
Assert.assertFalse(regexOptions.ignoreCase)
|
76
|
+
Assert.assertTrue(regexOptions.multiline)
|
77
|
+
Assert.assertFalse(regexOptions.dotMatchesAll)
|
78
|
+
Assert.assertFalse(regexOptions.enableComments)
|
79
|
+
}
|
80
|
+
|
81
|
+
@Test
|
82
|
+
fun testRegexOptions() {
|
83
|
+
val configYaml = """
|
84
|
+
|type: gsub
|
85
|
+
|target_columns:
|
86
|
+
| foo:
|
87
|
+
| - type: regexp_replace
|
88
|
+
| pattern: "test"
|
89
|
+
| regexp_options:
|
90
|
+
| ignore_case: true
|
91
|
+
| multiline: true
|
92
|
+
| dot_matches_all: true
|
93
|
+
| enable_comments: true
|
94
|
+
""".trimMargin()
|
95
|
+
|
96
|
+
val config = getConfigFromYaml(configYaml)
|
97
|
+
val task = config.loadConfig(GsubFilterPlugin.PluginTask::class.java)
|
98
|
+
val fooRules = task.targetColumns["foo"]!!
|
99
|
+
val fooRule = fooRules[0]
|
100
|
+
val regexOptions = fooRule.regexOptions
|
101
|
+
|
102
|
+
Assert.assertTrue(regexOptions.ignoreCase)
|
103
|
+
Assert.assertTrue(regexOptions.multiline)
|
104
|
+
Assert.assertTrue(regexOptions.dotMatchesAll)
|
105
|
+
Assert.assertTrue(regexOptions.enableComments)
|
106
|
+
}
|
107
|
+
|
60
108
|
@Test
|
61
109
|
fun testEmptyFilter() {
|
62
110
|
val configYaml = """
|
@@ -1,19 +1,134 @@
|
|
1
1
|
package org.embulk.filter.gsub.replacer
|
2
2
|
|
3
|
+
import org.embulk.config.TaskSource
|
4
|
+
import org.embulk.filter.gsub.RegexFactory
|
5
|
+
import org.embulk.filter.gsub.RegexOptions
|
3
6
|
import org.junit.Assert
|
4
7
|
import org.junit.Test
|
5
8
|
|
6
9
|
class RegexReplacerTest {
|
7
10
|
@Test
|
8
11
|
fun testExecute() {
|
9
|
-
val optionConfig = RegexOptionConfig()
|
10
|
-
optionConfig.ignoreCase = true
|
12
|
+
val regexOptions = createRegexpOption()
|
11
13
|
|
12
14
|
val factory = RegexFactory()
|
13
|
-
val pattern = factory.create("(\\w*):\\s*(.*)", optionConfig)
|
15
|
+
val pattern = factory.create("(\\w*):\\s*(.*)", regexOptions)
|
14
16
|
|
15
17
|
val replacer = RegexReplacer(pattern, "$1 [$2]")
|
16
18
|
val result = replacer.execute("test: foo bar baz")
|
17
19
|
Assert.assertEquals("test [foo bar baz]", result)
|
18
20
|
}
|
21
|
+
|
22
|
+
@Test
|
23
|
+
fun testExecuteWithoutIgnoreCaseOption() {
|
24
|
+
val regexOptions = createRegexpOption(ignoreCase = false)
|
25
|
+
|
26
|
+
val factory = RegexFactory()
|
27
|
+
val pattern = factory.create("foo", regexOptions)
|
28
|
+
|
29
|
+
val replacer = RegexReplacer(pattern, "*test-foo*")
|
30
|
+
|
31
|
+
Assert.assertEquals("*test-foo* bar baz", replacer.execute("foo bar baz"))
|
32
|
+
Assert.assertEquals("Foo bar baz", replacer.execute("Foo bar baz"))
|
33
|
+
Assert.assertEquals("FOO bar baz", replacer.execute("FOO bar baz"))
|
34
|
+
}
|
35
|
+
|
36
|
+
@Test
|
37
|
+
fun testExecuteWithIgnoreCaseOption() {
|
38
|
+
val regexOptions = createRegexpOption(ignoreCase = true)
|
39
|
+
val factory = RegexFactory()
|
40
|
+
val pattern = factory.create("foo", regexOptions)
|
41
|
+
|
42
|
+
val replacer = RegexReplacer(pattern, "*test-foo*")
|
43
|
+
|
44
|
+
Assert.assertEquals("*test-foo* bar baz", replacer.execute("foo bar baz"))
|
45
|
+
Assert.assertEquals("*test-foo* bar baz", replacer.execute("Foo bar baz"))
|
46
|
+
Assert.assertEquals("*test-foo* bar baz", replacer.execute("FOO bar baz"))
|
47
|
+
}
|
48
|
+
|
49
|
+
@Test
|
50
|
+
fun testExecuteWithoutMultilineOption() {
|
51
|
+
val regexOptions = createRegexpOption(multiline = false)
|
52
|
+
val factory = RegexFactory()
|
53
|
+
val pattern = factory.create("^bar", regexOptions)
|
54
|
+
|
55
|
+
val replacer = RegexReplacer(pattern, "*BAR*")
|
56
|
+
|
57
|
+
Assert.assertEquals("foo\nbar\nbaz", replacer.execute("foo\nbar\nbaz"))
|
58
|
+
}
|
59
|
+
|
60
|
+
@Test
|
61
|
+
fun testExecuteWithMultilineOption() {
|
62
|
+
val regexOptions = createRegexpOption(multiline = true)
|
63
|
+
val factory = RegexFactory()
|
64
|
+
val pattern = factory.create("^bar", regexOptions)
|
65
|
+
|
66
|
+
val replacer = RegexReplacer(pattern, "*BAR*")
|
67
|
+
|
68
|
+
Assert.assertEquals("foo\n*BAR*\nbaz", replacer.execute("foo\nbar\nbaz"))
|
69
|
+
}
|
70
|
+
|
71
|
+
@Test
|
72
|
+
fun testExecuteWithoutDotMatchesAllOption() {
|
73
|
+
val regexOptions = createRegexpOption(dotMatchesAll = false)
|
74
|
+
val factory = RegexFactory()
|
75
|
+
val pattern = factory.create("foo.bar.baz", regexOptions)
|
76
|
+
|
77
|
+
val replacer = RegexReplacer(pattern, "[foo-bar-baz]")
|
78
|
+
|
79
|
+
Assert.assertEquals("[foo-bar-baz]", replacer.execute("foo/bar/baz"))
|
80
|
+
Assert.assertEquals("foo\nbar/baz", replacer.execute("foo\nbar/baz"))
|
81
|
+
}
|
82
|
+
|
83
|
+
@Test
|
84
|
+
fun testExecuteWithDotMatchesAllOption() {
|
85
|
+
val regexOptions = createRegexpOption(dotMatchesAll = true)
|
86
|
+
val factory = RegexFactory()
|
87
|
+
val pattern = factory.create("foo.bar.baz", regexOptions)
|
88
|
+
|
89
|
+
val replacer = RegexReplacer(pattern, "[foo-bar-baz]")
|
90
|
+
|
91
|
+
Assert.assertEquals("[foo-bar-baz]", replacer.execute("foo/bar/baz"))
|
92
|
+
Assert.assertEquals("[foo-bar-baz]", replacer.execute("foo\nbar/baz"))
|
93
|
+
}
|
94
|
+
|
95
|
+
@Test
|
96
|
+
fun testExecuteWithEnableCommentsOption() {
|
97
|
+
val regexOptions = createRegexpOption(enableComments = true)
|
98
|
+
val factory = RegexFactory()
|
99
|
+
|
100
|
+
val patternString = """
|
101
|
+
|(ba\w) # matches bar and baz
|
102
|
+
""".trimMargin()
|
103
|
+
val pattern = factory.create(patternString, regexOptions)
|
104
|
+
|
105
|
+
val replacer = RegexReplacer(pattern, "*$1*")
|
106
|
+
|
107
|
+
Assert.assertEquals("foo *bar* *baz*", replacer.execute("foo bar baz"))
|
108
|
+
}
|
109
|
+
|
110
|
+
private fun createRegexpOption(
|
111
|
+
ignoreCase: Boolean = false,
|
112
|
+
multiline: Boolean = true,
|
113
|
+
dotMatchesAll: Boolean = false,
|
114
|
+
enableComments: Boolean = false
|
115
|
+
): RegexOptions {
|
116
|
+
return object: RegexOptions {
|
117
|
+
override val ignoreCase: Boolean
|
118
|
+
get() = ignoreCase
|
119
|
+
override val multiline: Boolean
|
120
|
+
get() = multiline
|
121
|
+
override val dotMatchesAll: Boolean
|
122
|
+
get() = dotMatchesAll
|
123
|
+
override val enableComments: Boolean
|
124
|
+
get() = enableComments
|
125
|
+
|
126
|
+
override fun validate() {
|
127
|
+
}
|
128
|
+
|
129
|
+
override fun dump(): TaskSource {
|
130
|
+
throw NotImplementedError()
|
131
|
+
}
|
132
|
+
}
|
133
|
+
}
|
19
134
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-filter-gsub
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sawada Tadashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-02-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -38,7 +38,7 @@ dependencies:
|
|
38
38
|
- - '>='
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
|
-
description:
|
41
|
+
description: Embulk filter plugin to convert text column values with regular expressions
|
42
42
|
email:
|
43
43
|
- cesare@mayverse.jp
|
44
44
|
executables: []
|
@@ -60,13 +60,14 @@ files:
|
|
60
60
|
- src/main/kotlin/org/embulk/filter/gsub/ColumnVisitorImpl.kt
|
61
61
|
- src/main/kotlin/org/embulk/filter/gsub/GsubFilterPlugin.kt
|
62
62
|
- src/main/kotlin/org/embulk/filter/gsub/LowerCaseReplacerFactory.kt
|
63
|
+
- src/main/kotlin/org/embulk/filter/gsub/RegexFactory.kt
|
64
|
+
- src/main/kotlin/org/embulk/filter/gsub/RegexOptions.kt
|
63
65
|
- src/main/kotlin/org/embulk/filter/gsub/RegexReplacerFactory.kt
|
64
66
|
- src/main/kotlin/org/embulk/filter/gsub/SubstitutionRule.kt
|
65
67
|
- src/main/kotlin/org/embulk/filter/gsub/TextReplacerFactory.kt
|
66
68
|
- src/main/kotlin/org/embulk/filter/gsub/UpperCaseReplacerFactory.kt
|
67
69
|
- src/main/kotlin/org/embulk/filter/gsub/replacer/CombinedReplacer.kt
|
68
70
|
- src/main/kotlin/org/embulk/filter/gsub/replacer/LowerCaseReplacer.kt
|
69
|
-
- src/main/kotlin/org/embulk/filter/gsub/replacer/RegexFactory.kt
|
70
71
|
- src/main/kotlin/org/embulk/filter/gsub/replacer/RegexOptionConfig.kt
|
71
72
|
- src/main/kotlin/org/embulk/filter/gsub/replacer/RegexReplacer.kt
|
72
73
|
- src/main/kotlin/org/embulk/filter/gsub/replacer/TextReplacer.kt
|
@@ -76,8 +77,8 @@ files:
|
|
76
77
|
- src/test/kotlin/org/embulk/filter/gsub/replacer/LowerCaseReplacerTest.kt
|
77
78
|
- src/test/kotlin/org/embulk/filter/gsub/replacer/RegexReplacerTest.kt
|
78
79
|
- classpath/annotations-13.0.jar
|
79
|
-
- classpath/embulk-filter-gsub-0.1.0.jar
|
80
|
-
- classpath/kotlin-stdlib-1.2.
|
80
|
+
- classpath/embulk-filter-gsub-0.2.0.jar
|
81
|
+
- classpath/kotlin-stdlib-1.2.21.jar
|
81
82
|
homepage: https://github.com/cesare/embulk-filter-gsub
|
82
83
|
licenses:
|
83
84
|
- MIT
|
@@ -1,22 +0,0 @@
|
|
1
|
-
package org.embulk.filter.gsub.replacer
|
2
|
-
|
3
|
-
class RegexFactory {
|
4
|
-
fun create(patternString: String, regexOptionConfig: RegexOptionConfig): Regex {
|
5
|
-
val options = buildOptions(regexOptionConfig)
|
6
|
-
return Regex(patternString, options)
|
7
|
-
}
|
8
|
-
|
9
|
-
private fun buildOptions(optionConfig: RegexOptionConfig): Set<RegexOption> {
|
10
|
-
val options = HashSet<RegexOption>()
|
11
|
-
|
12
|
-
if (optionConfig.ignoreCase) {
|
13
|
-
options.add(RegexOption.IGNORE_CASE)
|
14
|
-
}
|
15
|
-
|
16
|
-
if (optionConfig.multiline) {
|
17
|
-
options.add(RegexOption.MULTILINE)
|
18
|
-
}
|
19
|
-
|
20
|
-
return options
|
21
|
-
}
|
22
|
-
}
|