embulk-filter-gsub 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.circleci/config.yml +42 -0
- data/.gitignore +14 -0
- data/LICENSE.txt +21 -0
- data/README.md +141 -0
- data/build.gradle +49 -0
- data/gradle.properties +1 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +6 -0
- data/gradlew +172 -0
- data/gradlew.bat +84 -0
- data/lib/embulk/filter/gsub.rb +3 -0
- data/src/main/kotlin/org/embulk/filter/gsub/ColumnReplacerFactory.kt +20 -0
- data/src/main/kotlin/org/embulk/filter/gsub/ColumnVisitorImpl.kt +72 -0
- data/src/main/kotlin/org/embulk/filter/gsub/GsubFilterPlugin.kt +51 -0
- data/src/main/kotlin/org/embulk/filter/gsub/LowerCaseReplacerFactory.kt +23 -0
- data/src/main/kotlin/org/embulk/filter/gsub/RegexReplacerFactory.kt +20 -0
- data/src/main/kotlin/org/embulk/filter/gsub/SubstitutionRule.kt +26 -0
- data/src/main/kotlin/org/embulk/filter/gsub/TextReplacerFactory.kt +33 -0
- data/src/main/kotlin/org/embulk/filter/gsub/UpperCaseReplacerFactory.kt +23 -0
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/CombinedReplacer.kt +7 -0
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/LowerCaseReplacer.kt +17 -0
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/RegexFactory.kt +22 -0
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/RegexOptionConfig.kt +6 -0
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/RegexReplacer.kt +7 -0
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/TextReplacer.kt +5 -0
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/UpperCaseReplacer.kt +18 -0
- data/src/test/kotlin/org/embulk/filter/gsub/TestGsubFilterPlugin.kt +221 -0
- data/src/test/kotlin/org/embulk/filter/gsub/replacer/CombinedReplacerTest.kt +37 -0
- data/src/test/kotlin/org/embulk/filter/gsub/replacer/LowerCaseReplacerTest.kt +20 -0
- data/src/test/kotlin/org/embulk/filter/gsub/replacer/RegexReplacerTest.kt +19 -0
- metadata +105 -0
data/gradlew.bat
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@rem
@rem Locates a Java executable (JAVA_HOME first, then PATH), then launches
@rem org.gradle.wrapper.GradleWrapperMain from gradle\wrapper\gradle-wrapper.jar
@rem with every argument given to this script.
@rem
@rem ##########################################################################

@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal

set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%

@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=

@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome

@rem No JAVA_HOME: probe whether a java.exe is reachable through PATH.
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init

echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:findJavaFromJavaHome
@rem Strip any double quotes from JAVA_HOME before building the executable path.
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe

if exist "%JAVA_EXE%" goto init

echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:init
@rem Capture the command line arguments.
@rem The arguments are taken unconditionally: the former "is the first
@rem argument empty?" guard discarded every argument whenever the first one
@rem was an empty string, and the Win9x/ME branch in front of it was a no-op.
set CMD_LINE_ARGS=%*

:execute
@rem Setup the command line

set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar

@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%

:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd

:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1

:mainEnd
if "%OS%"=="Windows_NT" endlocal

:omega
|
@@ -0,0 +1,20 @@
|
|
1
|
+
package org.embulk.filter.gsub
|
2
|
+
|
3
|
+
import org.embulk.filter.gsub.replacer.*
|
4
|
+
|
5
|
+
/**
 * Turns the `target_columns` section of the plugin task into ready-to-use replacers.
 */
class ColumnReplacerFactory {
    /**
     * Builds one replacer per configured column.
     *
     * @param task plugin task whose `targetColumns` maps a column name to its substitution rules
     * @return a map with the same keys as `targetColumns`; each value is a [CombinedReplacer]
     *         wrapping the replacers created from that column's rules, in rule order
     */
    fun create(task: GsubFilterPlugin.PluginTask): Map<String, TextReplacer> =
        task.targetColumns.mapValues { entry -> combine(entry.value) }

    /** Creates a replacer for every rule and wraps the resulting list in a single [CombinedReplacer]. */
    private fun combine(rules: List<SubstitutionRule>): TextReplacer =
        CombinedReplacer(rules.map { rule -> TextReplacerFactory.create(rule) })
}
|
@@ -0,0 +1,72 @@
|
|
1
|
+
package org.embulk.filter.gsub
|
2
|
+
|
3
|
+
import org.embulk.filter.gsub.GsubFilterPlugin.PluginTask
|
4
|
+
import org.embulk.filter.gsub.replacer.TextReplacer
|
5
|
+
import org.embulk.spi.*
|
6
|
+
|
7
|
+
/**
 * Column visitor that copies the current record from [pageReader] to [pageBuilder],
 * rewriting string values for which a replacer is configured.
 *
 * @param task plugin task used once, at construction, to build the per-column replacers
 * @param pageReader source of the values of the record currently being visited
 * @param pageBuilder destination the (possibly rewritten) values are written to
 */
class ColumnVisitorImpl
constructor(
    task: PluginTask,
    private val pageReader: PageReader,
    private val pageBuilder: PageBuilder
) : ColumnVisitor {

    /** Replacers keyed by column name; columns without an entry are copied unchanged. */
    private val columnReplacers: Map<String, TextReplacer> = ColumnReplacerFactory().create(task)

    override fun booleanColumn(column: Column) = copyUnlessNull(column) {
        pageBuilder.setBoolean(column, pageReader.getBoolean(column))
    }

    override fun longColumn(column: Column) = copyUnlessNull(column) {
        pageBuilder.setLong(column, pageReader.getLong(column))
    }

    override fun doubleColumn(column: Column) = copyUnlessNull(column) {
        pageBuilder.setDouble(column, pageReader.getDouble(column))
    }

    /** Copies a string value, passing it through the replacer registered for the column name, if any. */
    override fun stringColumn(column: Column) = copyUnlessNull(column) {
        val original = pageReader.getString(column)
        val rewritten = columnReplacers[column.name]?.execute(original) ?: original
        pageBuilder.setString(column, rewritten)
    }

    override fun timestampColumn(column: Column) = copyUnlessNull(column) {
        pageBuilder.setTimestamp(column, pageReader.getTimestamp(column))
    }

    override fun jsonColumn(column: Column) = copyUnlessNull(column) {
        pageBuilder.setJson(column, pageReader.getJson(column))
    }

    /**
     * Writes null to the builder when the reader reports null for [column];
     * otherwise runs [writeValue], which is expected to copy the typed value.
     */
    private inline fun copyUnlessNull(column: Column, writeValue: () -> Unit) {
        if (pageReader.isNull(column)) {
            pageBuilder.setNull(column)
        } else {
            writeValue()
        }
    }
}
|
@@ -0,0 +1,51 @@
|
|
1
|
+
package org.embulk.filter.gsub
|
2
|
+
|
3
|
+
import org.embulk.config.Config
|
4
|
+
import org.embulk.config.ConfigDefault
|
5
|
+
import org.embulk.config.ConfigSource
|
6
|
+
import org.embulk.config.Task
|
7
|
+
import org.embulk.config.TaskSource
|
8
|
+
import org.embulk.spi.*
|
9
|
+
|
10
|
+
/**
 * Embulk filter plugin that rewrites string columns according to the
 * substitution rules configured under `target_columns`.
 */
class GsubFilterPlugin : FilterPlugin {
    /** Plugin configuration. */
    interface PluginTask : Task {
        /** Substitution rules keyed by column name; defaults to an empty map. */
        @get:Config("target_columns")
        @get:ConfigDefault("{}")
        val targetColumns: Map<String, List<SubstitutionRule>>
    }

    /**
     * Loads the plugin task from [config] and hands it to [control].
     * The input schema is passed on unchanged as the output schema.
     */
    override fun transaction(config: ConfigSource, inputSchema: Schema,
                             control: FilterPlugin.Control) {
        val task = config.loadConfig<PluginTask>(PluginTask::class.java)

        control.run(task.dump(), inputSchema)
    }

    /**
     * Creates the page output that performs the filtering.
     *
     * Every record of every added page is visited column by column with a
     * [ColumnVisitorImpl] and appended to a page builder that writes to [output].
     */
    override fun open(taskSource: TaskSource, inputSchema: Schema,
                      outputSchema: Schema, output: PageOutput): PageOutput {
        val task = taskSource.loadTask<PluginTask>(PluginTask::class.java)

        return object : PageOutput {
            val pageReader = PageReader(inputSchema)
            val pageBuilder = PageBuilder(Exec.getBufferAllocator(), outputSchema, output)
            val columnVisitor = ColumnVisitorImpl(task, pageReader, pageBuilder)

            override fun add(page: Page) {
                pageReader.setPage(page)

                while (pageReader.nextRecord()) {
                    inputSchema.visitColumns(columnVisitor)
                    pageBuilder.addRecord()
                }
            }

            override fun finish() {
                pageBuilder.finish()
            }

            override fun close() {
                // The reader holds on to the last page it was given, so it has to be
                // closed as well. The builder is closed in `finally` so that a failure
                // while closing the reader cannot leave the builder open.
                try {
                    pageReader.close()
                } finally {
                    pageBuilder.close()
                }
            }
        }
    }
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
package org.embulk.filter.gsub
|
2
|
+
|
3
|
+
import org.embulk.filter.gsub.replacer.LowerCaseReplacer
|
4
|
+
import org.embulk.filter.gsub.replacer.RegexFactory
|
5
|
+
import org.embulk.filter.gsub.replacer.RegexOptionConfig
|
6
|
+
import org.embulk.filter.gsub.replacer.TextReplacer
|
7
|
+
|
8
|
+
/**
 * Creates [LowerCaseReplacer] instances from a substitution rule.
 */
class LowerCaseReplacerFactory : TextReplacerFactory() {
    /**
     * Returns a replacer built from the rule's `pattern` when one is configured,
     * or a replacer created without a pattern otherwise.
     */
    override fun create(rule: SubstitutionRule): TextReplacer {
        val patternText = rule.pattern.orNull() ?: return LowerCaseReplacer()

        // TODO set regex options
        val defaultOptions = RegexOptionConfig()
        return LowerCaseReplacer(RegexFactory().create(patternText, defaultOptions))
    }
}
|
@@ -0,0 +1,20 @@
|
|
1
|
+
package org.embulk.filter.gsub
|
2
|
+
|
3
|
+
import org.embulk.filter.gsub.replacer.RegexFactory
|
4
|
+
import org.embulk.filter.gsub.replacer.RegexOptionConfig
|
5
|
+
import org.embulk.filter.gsub.replacer.RegexReplacer
|
6
|
+
import org.embulk.filter.gsub.replacer.TextReplacer
|
7
|
+
|
8
|
+
/**
 * Creates [RegexReplacer] instances from a substitution rule.
 */
class RegexReplacerFactory : TextReplacerFactory() {
    /**
     * Builds a [RegexReplacer] from the rule's `pattern` and `to` options.
     *
     * Both options are declared optional on [SubstitutionRule] but are required here.
     *
     * @throws IllegalStateException when `pattern` or `to` is missing from the rule;
     *         the message names the missing option
     */
    override fun create(rule: SubstitutionRule): TextReplacer {
        // Same exception type as Optional.get() on an absent value, but with a
        // message that tells the user which option to add to the configuration.
        val pattern = checkNotNull(rule.pattern.orNull()) {
            "'pattern' is required for substitution type '${rule.type}'"
        }
        val to = checkNotNull(rule.to.orNull()) {
            "'to' is required for substitution type '${rule.type}'"
        }

        val regexOptionConfig = RegexOptionConfig()
        val regex = RegexFactory().create(pattern, regexOptionConfig)

        return RegexReplacer(regex, to)
    }
}
|
@@ -0,0 +1,26 @@
|
|
1
|
+
package org.embulk.filter.gsub
|
2
|
+
|
3
|
+
import com.google.common.base.Optional
|
4
|
+
import org.embulk.config.Config
|
5
|
+
import org.embulk.config.ConfigDefault
|
6
|
+
import org.embulk.config.Task
|
7
|
+
|
8
|
+
/**
 * Configuration of a single substitution applied to a column.
 */
interface SubstitutionRule : Task {
    /**
     * Supported substitution kinds.
     *
     * @property label lower-case spelling of the constant name
     */
    enum class SubstitutionType(val label: String) {
        REGEXP_REPLACE("regexp_replace"),
        TO_UPPER_CASE("to_upper_case"),
        TO_LOWER_CASE("to_lower_case"),
    }

    /** Name of the substitution kind (`type` option); defaults to `"regexp_replace"`. */
    @get:Config("type")
    @get:ConfigDefault("\"regexp_replace\"")
    val type: String

    /** Regular expression source (`pattern` option); the configured default is `null`. */
    @get:Config("pattern")
    @get:ConfigDefault("null")
    val pattern: Optional<String>

    /** Value of the `to` option; the configured default is `null`. */
    @get:Config("to")
    @get:ConfigDefault("null")
    val to: Optional<String>
}
|
@@ -0,0 +1,33 @@
|
|
1
|
+
package org.embulk.filter.gsub
|
2
|
+
|
3
|
+
import org.embulk.filter.gsub.replacer.TextReplacer
|
4
|
+
import org.embulk.filter.gsub.SubstitutionRule.SubstitutionType
|
5
|
+
|
6
|
+
/**
 * Base class of the factories that turn a [SubstitutionRule] into a [TextReplacer].
 *
 * The companion object is the entry point: it selects the concrete factory
 * from the rule's `type` and delegates to it.
 */
abstract class TextReplacerFactory {
    /** Creates the replacer described by [rule]. */
    abstract fun create(rule: SubstitutionRule): TextReplacer

    companion object Factory {
        /**
         * Creates a replacer using the factory that matches the rule's type.
         *
         * @throws RuntimeException when the rule's type is not a known substitution type
         */
        fun create(rule: SubstitutionRule): TextReplacer = factoryFor(rule).create(rule)

        /** Picks the concrete factory for the rule's substitution type. */
        private fun factoryFor(rule: SubstitutionRule): TextReplacerFactory =
            when (parseType(rule.type)) {
                SubstitutionType.REGEXP_REPLACE -> RegexReplacerFactory()
                SubstitutionType.TO_UPPER_CASE -> UpperCaseReplacerFactory()
                SubstitutionType.TO_LOWER_CASE -> LowerCaseReplacerFactory()
            }

        /** Resolves the upper-cased type name to an enum constant. */
        private fun parseType(typeName: String): SubstitutionType =
            try {
                SubstitutionType.valueOf(typeName.toUpperCase())
            } catch (cause: IllegalArgumentException) {
                throw RuntimeException("Unknown substitution type: ${typeName}", cause)
            }
    }
}
|
@@ -0,0 +1,23 @@
|
|
1
|
+
package org.embulk.filter.gsub
|
2
|
+
|
3
|
+
import org.embulk.filter.gsub.replacer.RegexFactory
|
4
|
+
import org.embulk.filter.gsub.replacer.RegexOptionConfig
|
5
|
+
import org.embulk.filter.gsub.replacer.TextReplacer
|
6
|
+
import org.embulk.filter.gsub.replacer.UpperCaseReplacer
|
7
|
+
|
8
|
+
/**
 * Creates [UpperCaseReplacer] instances from a substitution rule.
 */
class UpperCaseReplacerFactory : TextReplacerFactory() {
    /**
     * Returns a replacer built from the rule's `pattern` when one is configured,
     * or a replacer created without a pattern otherwise.
     */
    override fun create(rule: SubstitutionRule): TextReplacer {
        val patternText = rule.pattern.orNull() ?: return UpperCaseReplacer()

        // TODO set regex options
        val defaultOptions = RegexOptionConfig()
        return UpperCaseReplacer(RegexFactory().create(patternText, defaultOptions))
    }
}
|
@@ -0,0 +1,17 @@
|
|
1
|
+
package org.embulk.filter.gsub.replacer
|
2
|
+
|
3
|
+
/**
 * Replacer that converts text to lower case.
 *
 * With a [pattern], only the parts of the text matched by it are converted;
 * without one, the whole text is converted.
 *
 * Conversion uses [java.util.Locale.ROOT], so the result does not depend on the
 * default locale of the JVM running the plugin (for example, "I" is never turned
 * into a dotless "ı" on a Turkish-locale host).
 *
 * @param pattern regular expression selecting the parts to convert, or `null` for the whole text
 */
class LowerCaseReplacer(private val pattern: Regex?) : TextReplacer {
    /** Creates a replacer that lower-cases the whole text. */
    constructor() : this(null)

    /** Returns [text] with the whole text, or every match of the pattern, lower-cased. */
    override fun execute(text: String): String =
        if (pattern == null) {
            lowerCase(text)
        } else {
            pattern.replace(text) { match -> lowerCase(match.value) }
        }

    /** Locale-independent lower-casing shared by both modes. */
    private fun lowerCase(text: String): String = text.toLowerCase(java.util.Locale.ROOT)
}
|
@@ -0,0 +1,22 @@
|
|
1
|
+
package org.embulk.filter.gsub.replacer
|
2
|
+
|
3
|
+
/**
 * Compiles pattern strings into [Regex] instances.
 */
class RegexFactory {
    /**
     * Compiles [patternString] with the options enabled in [regexOptionConfig].
     */
    fun create(patternString: String, regexOptionConfig: RegexOptionConfig): Regex =
        Regex(patternString, buildOptions(regexOptionConfig))

    /** Collects the [RegexOption]s whose flag is switched on in [optionConfig]. */
    private fun buildOptions(optionConfig: RegexOptionConfig): Set<RegexOption> {
        val switches = listOf(
            RegexOption.IGNORE_CASE to optionConfig.ignoreCase,
            RegexOption.MULTILINE to optionConfig.multiline
        )
        return switches.filter { it.second }.map { it.first }.toSet()
    }
}
|