embulk-filter-gsub 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.circleci/config.yml +42 -0
- data/.gitignore +14 -0
- data/LICENSE.txt +21 -0
- data/README.md +141 -0
- data/build.gradle +49 -0
- data/gradle.properties +1 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +6 -0
- data/gradlew +172 -0
- data/gradlew.bat +84 -0
- data/lib/embulk/filter/gsub.rb +3 -0
- data/src/main/kotlin/org/embulk/filter/gsub/ColumnReplacerFactory.kt +20 -0
- data/src/main/kotlin/org/embulk/filter/gsub/ColumnVisitorImpl.kt +72 -0
- data/src/main/kotlin/org/embulk/filter/gsub/GsubFilterPlugin.kt +51 -0
- data/src/main/kotlin/org/embulk/filter/gsub/LowerCaseReplacerFactory.kt +23 -0
- data/src/main/kotlin/org/embulk/filter/gsub/RegexReplacerFactory.kt +20 -0
- data/src/main/kotlin/org/embulk/filter/gsub/SubstitutionRule.kt +26 -0
- data/src/main/kotlin/org/embulk/filter/gsub/TextReplacerFactory.kt +33 -0
- data/src/main/kotlin/org/embulk/filter/gsub/UpperCaseReplacerFactory.kt +23 -0
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/CombinedReplacer.kt +7 -0
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/LowerCaseReplacer.kt +17 -0
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/RegexFactory.kt +22 -0
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/RegexOptionConfig.kt +6 -0
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/RegexReplacer.kt +7 -0
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/TextReplacer.kt +5 -0
- data/src/main/kotlin/org/embulk/filter/gsub/replacer/UpperCaseReplacer.kt +18 -0
- data/src/test/kotlin/org/embulk/filter/gsub/TestGsubFilterPlugin.kt +221 -0
- data/src/test/kotlin/org/embulk/filter/gsub/replacer/CombinedReplacerTest.kt +37 -0
- data/src/test/kotlin/org/embulk/filter/gsub/replacer/LowerCaseReplacerTest.kt +20 -0
- data/src/test/kotlin/org/embulk/filter/gsub/replacer/RegexReplacerTest.kt +19 -0
- metadata +105 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: fffdfbb5f26bf7c3ee0abf13c355b29080a09336
|
4
|
+
data.tar.gz: 96e001cd2b5162bd6bf50c7ec239e2134496ca9d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7929b95e1cb3aa18b800305a835d385ccd6245fc6eb0421c667a40468936a587d4a8523e889d9c36eead27fa94109556b9ff7e361476016a9f68ab805d654d54
|
7
|
+
data.tar.gz: e0a1dce998d24320a9c188dd4cb0ea4189bbad9a148be600a2f09f0c084f9251c435208946bc28f6ae12fb244857ebb168da89cba829676eb1fe31a62ebdfb92
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# Java Gradle CircleCI 2.0 configuration file
|
2
|
+
#
|
3
|
+
# Check https://circleci.com/docs/2.0/language-java/ for more details
|
4
|
+
#
|
5
|
+
version: 2
|
6
|
+
jobs:
|
7
|
+
build:
|
8
|
+
docker:
|
9
|
+
# specify the version you desire here
|
10
|
+
- image: circleci/openjdk:8-jdk
|
11
|
+
|
12
|
+
# Specify service dependencies here if necessary
|
13
|
+
# CircleCI maintains a library of pre-built images
|
14
|
+
# documented at https://circleci.com/docs/2.0/circleci-images/
|
15
|
+
# - image: circleci/postgres:9.4
|
16
|
+
|
17
|
+
working_directory: ~/repo
|
18
|
+
|
19
|
+
environment:
|
20
|
+
# Customize the JVM maximum heap limit
|
21
|
+
JVM_OPTS: -Xmx3200m
|
22
|
+
TERM: dumb
|
23
|
+
|
24
|
+
steps:
|
25
|
+
- checkout
|
26
|
+
|
27
|
+
# Download and cache dependencies
|
28
|
+
- restore_cache:
|
29
|
+
keys:
|
30
|
+
- v1-dependencies-{{ checksum "build.gradle" }}
|
31
|
+
# fallback to using the latest cache if no exact match is found
|
32
|
+
- v1-dependencies-
|
33
|
+
|
34
|
+
- run: gradle dependencies
|
35
|
+
|
36
|
+
- save_cache:
|
37
|
+
paths:
|
38
|
+
- ~/.gradle
|
39
|
+
key: v1-dependencies-{{ checksum "build.gradle" }}
|
40
|
+
|
41
|
+
# run tests!
|
42
|
+
- run: gradle test
|
data/.gitignore
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
|
2
|
+
MIT License
|
3
|
+
|
4
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
a copy of this software and associated documentation files (the
|
6
|
+
"Software"), to deal in the Software without restriction, including
|
7
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be
|
13
|
+
included in all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,141 @@
|
|
1
|
+
# Gsub filter plugin for Embulk
|
2
|
+
|
3
|
+
[![CircleCI](https://circleci.com/gh/cesare/embulk-filter-gsub.svg?style=svg)](https://circleci.com/gh/cesare/embulk-filter-gsub)
|
4
|
+
|
5
|
+
Embulk filter plugin to convert text column values with regular expressions.
|
6
|
+
|
7
|
+
## Overview
|
8
|
+
|
9
|
+
* **Plugin type**: filter
|
10
|
+
|
11
|
+
## Configuration
|
12
|
+
|
13
|
+
- **target_columns**: columns whose text values are converted (array, default: `[]`)
|
14
|
+
|
15
|
+
### Example
|
16
|
+
|
17
|
+
```yaml
|
18
|
+
filters:
|
19
|
+
- type: gsub
|
20
|
+
target_columns:
|
21
|
+
foo:
|
22
|
+
- type: regexp_replace
|
23
|
+
pattern: '(\w*):\s*(\w*)'
|
24
|
+
to: "$1 = [$2]"
|
25
|
+
bar:
|
26
|
+
- type: to_lower_case
|
27
|
+
baz:
|
28
|
+
- type: to_upper_case
|
29
|
+
pattern: "test"
|
30
|
+
```
|
31
|
+
|
32
|
+
### target column configuration
|
33
|
+
|
34
|
+
- **type**: type of text substitution (string, default: `regexp_replace`)
|
35
|
+
|
36
|
+
Supported types are:
|
37
|
+
|
38
|
+
* regexp_replace
|
39
|
+
* to_lower_case
|
40
|
+
* to_upper_case
|
41
|
+
|
42
|
+
#### regexp_replace
|
43
|
+
|
44
|
+
- **pattern**: regular expression pattern to be substituted (string, required)
|
45
|
+
- **to**: replacement string (string, required)
|
46
|
+
|
47
|
+
##### Example
|
48
|
+
|
49
|
+
```yaml
|
50
|
+
target_columns:
|
51
|
+
foo:
|
52
|
+
- type: regexp_replace
|
53
|
+
pattern: '(\w*):\s*(\w*)'
|
54
|
+
to: "$1 = [$2]"
|
55
|
+
```
|
56
|
+
|
57
|
+
It converts input like this
|
58
|
+
|
59
|
+
foo | bar
|
60
|
+
-----|-----
|
61
|
+
example-foo: 1234 | example-bar: 9876
|
62
|
+
|
63
|
+
into the output
|
64
|
+
|
65
|
+
foo | bar
|
66
|
+
-----|-----
|
67
|
+
example-foo = [1234] | example-bar: 9876
|
68
|
+
|
69
|
+
#### to_lower_case
|
70
|
+
|
71
|
+
- **pattern**: regular expression pattern to be substituted (string, optional)
|
72
|
+
|
73
|
+
If `pattern` is omitted, the whole text is converted to lowercase.
|
74
|
+
|
75
|
+
##### Example
|
76
|
+
|
77
|
+
```yaml
|
78
|
+
target_columns:
|
79
|
+
foo:
|
80
|
+
- type: to_lower_case
|
81
|
+
```
|
82
|
+
|
83
|
+
It converts input like this
|
84
|
+
|
85
|
+
foo | bar
|
86
|
+
-----|-----
|
87
|
+
ABC | XYZ
|
88
|
+
|
89
|
+
into the output
|
90
|
+
|
91
|
+
foo | bar
|
92
|
+
-----|-----
|
93
|
+
abc | XYZ
|
94
|
+
|
95
|
+
#### to_upper_case
|
96
|
+
|
97
|
+
- **pattern**: regular expression pattern to be substituted (string, optional)
|
98
|
+
|
99
|
+
If `pattern` is omitted, the whole text is converted to uppercase.
|
100
|
+
|
101
|
+
##### Example
|
102
|
+
|
103
|
+
```yaml
|
104
|
+
target_columns:
|
105
|
+
foo:
|
106
|
+
- type: to_upper_case
|
107
|
+
```
|
108
|
+
|
109
|
+
It converts input like this
|
110
|
+
|
111
|
+
foo | bar
|
112
|
+
-----|-----
|
113
|
+
abc | xyz
|
114
|
+
|
115
|
+
into the output
|
116
|
+
|
117
|
+
foo | bar
|
118
|
+
-----|-----
|
119
|
+
ABC | xyz
|
120
|
+
|
121
|
+
### Multiple conversions
|
122
|
+
|
123
|
+
You can apply multiple conversions to a single column value.
|
124
|
+
|
125
|
+
```yaml
|
126
|
+
target_columns:
|
127
|
+
foo:
|
128
|
+
- type: regexp_replace
|
129
|
+
pattern: '</?\w*\s*/?>'
|
130
|
+
to: ""
|
131
|
+
- type: regexp_replace
|
132
|
+
pattern: '(\w*):\s*(\w*)'
|
133
|
+
to: "$1 = [$2]"
|
134
|
+
```
|
135
|
+
|
136
|
+
|
137
|
+
## Build
|
138
|
+
|
139
|
+
```
|
140
|
+
$ ./gradlew gem # -t to watch change of files and rebuild continuously
|
141
|
+
```
|
data/build.gradle
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
buildscript {
|
2
|
+
ext.kotlin_version = '1.2.10'
|
3
|
+
repositories {
|
4
|
+
mavenCentral()
|
5
|
+
jcenter()
|
6
|
+
maven { url 'http://kamatama41.github.com/maven-repository/repository' }
|
7
|
+
}
|
8
|
+
dependencies {
|
9
|
+
classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlin_version"
|
10
|
+
classpath "com.github.kamatama41:gradle-embulk-plugin:0.1.4"
|
11
|
+
}
|
12
|
+
}
|
13
|
+
|
14
|
+
apply plugin: "kotlin"
|
15
|
+
apply plugin: "com.github.kamatama41.embulk"
|
16
|
+
|
17
|
+
embulk {
|
18
|
+
version = "0.8.38"
|
19
|
+
category = "filter"
|
20
|
+
name = "gsub"
|
21
|
+
authors = ["Sawada Tadashi"]
|
22
|
+
email = "cesare@mayverse.jp"
|
23
|
+
homepage = "https://github.com/cesare/embulk-filter-gsub"
|
24
|
+
}
|
25
|
+
|
26
|
+
repositories {
|
27
|
+
mavenCentral()
|
28
|
+
}
|
29
|
+
|
30
|
+
dependencies {
|
31
|
+
compile "org.jetbrains.kotlin:kotlin-stdlib:$kotlin_version"
|
32
|
+
testCompile "junit:junit:4.12", {
|
33
|
+
transitive = false
|
34
|
+
}
|
35
|
+
testCompile "org.hamcrest:hamcrest-all:1.3"
|
36
|
+
testCompile "org.embulk:embulk-core:0.8.38:tests"
|
37
|
+
}
|
38
|
+
|
39
|
+
compileKotlin {
|
40
|
+
kotlinOptions {
|
41
|
+
jvmTarget = "1.6" // only 1.6 and 1.8 are supported, we can't choose 1.7
|
42
|
+
}
|
43
|
+
}
|
44
|
+
|
45
|
+
compileTestKotlin {
|
46
|
+
kotlinOptions {
|
47
|
+
jvmTarget = "1.6" // only 1.6 and 1.8 are supported, we can't choose 1.7
|
48
|
+
}
|
49
|
+
}
|
data/gradle.properties
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
version=0.1.0
|
Binary file
|
data/gradlew
ADDED
@@ -0,0 +1,172 @@
|
|
1
|
+
#!/usr/bin/env sh
|
2
|
+
|
3
|
+
##############################################################################
|
4
|
+
##
|
5
|
+
## Gradle start up script for UN*X
|
6
|
+
##
|
7
|
+
##############################################################################
|
8
|
+
|
9
|
+
# Attempt to set APP_HOME
|
10
|
+
# Resolve links: $0 may be a link
|
11
|
+
PRG="$0"
|
12
|
+
# Need this for relative symlinks.
|
13
|
+
while [ -h "$PRG" ] ; do
|
14
|
+
ls=`ls -ld "$PRG"`
|
15
|
+
link=`expr "$ls" : '.*-> \(.*\)$'`
|
16
|
+
if expr "$link" : '/.*' > /dev/null; then
|
17
|
+
PRG="$link"
|
18
|
+
else
|
19
|
+
PRG=`dirname "$PRG"`"/$link"
|
20
|
+
fi
|
21
|
+
done
|
22
|
+
SAVED="`pwd`"
|
23
|
+
cd "`dirname \"$PRG\"`/" >/dev/null
|
24
|
+
APP_HOME="`pwd -P`"
|
25
|
+
cd "$SAVED" >/dev/null
|
26
|
+
|
27
|
+
APP_NAME="Gradle"
|
28
|
+
APP_BASE_NAME=`basename "$0"`
|
29
|
+
|
30
|
+
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
|
31
|
+
DEFAULT_JVM_OPTS=""
|
32
|
+
|
33
|
+
# Use the maximum available, or set MAX_FD != -1 to use that value.
|
34
|
+
MAX_FD="maximum"
|
35
|
+
|
36
|
+
# Print a non-fatal warning to stderr.
# Arguments: all arguments are joined with single spaces into one message.
# Outputs:   the message on stderr, so that diagnostics never mix with
#            anything a caller captures or pipes from stdout.
warn () {
    echo "$*" >&2
}
|
39
|
+
|
40
|
+
# Report a fatal error and terminate the script.
# Arguments: all arguments are joined with single spaces into one message.
# Outputs:   the message, framed by one blank line above and below, on
#            stderr so that it never mixes with captured stdout.
# Exits:     always exits the calling shell with status 1.
die () {
    {
        echo
        echo "$*"
        echo
    } >&2
    exit 1
}
|
46
|
+
|
47
|
+
# OS specific support (must be 'true' or 'false').
|
48
|
+
cygwin=false
|
49
|
+
msys=false
|
50
|
+
darwin=false
|
51
|
+
nonstop=false
|
52
|
+
case "`uname`" in
|
53
|
+
CYGWIN* )
|
54
|
+
cygwin=true
|
55
|
+
;;
|
56
|
+
Darwin* )
|
57
|
+
darwin=true
|
58
|
+
;;
|
59
|
+
MINGW* )
|
60
|
+
msys=true
|
61
|
+
;;
|
62
|
+
NONSTOP* )
|
63
|
+
nonstop=true
|
64
|
+
;;
|
65
|
+
esac
|
66
|
+
|
67
|
+
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
|
68
|
+
|
69
|
+
# Determine the Java command to use to start the JVM.
|
70
|
+
if [ -n "$JAVA_HOME" ] ; then
|
71
|
+
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
|
72
|
+
# IBM's JDK on AIX uses strange locations for the executables
|
73
|
+
JAVACMD="$JAVA_HOME/jre/sh/java"
|
74
|
+
else
|
75
|
+
JAVACMD="$JAVA_HOME/bin/java"
|
76
|
+
fi
|
77
|
+
if [ ! -x "$JAVACMD" ] ; then
|
78
|
+
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
|
79
|
+
|
80
|
+
Please set the JAVA_HOME variable in your environment to match the
|
81
|
+
location of your Java installation."
|
82
|
+
fi
|
83
|
+
else
|
84
|
+
JAVACMD="java"
|
85
|
+
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
|
86
|
+
|
87
|
+
Please set the JAVA_HOME variable in your environment to match the
|
88
|
+
location of your Java installation."
|
89
|
+
fi
|
90
|
+
|
91
|
+
# Increase the maximum file descriptors if we can.
|
92
|
+
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
|
93
|
+
MAX_FD_LIMIT=`ulimit -H -n`
|
94
|
+
if [ $? -eq 0 ] ; then
|
95
|
+
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
|
96
|
+
MAX_FD="$MAX_FD_LIMIT"
|
97
|
+
fi
|
98
|
+
ulimit -n $MAX_FD
|
99
|
+
if [ $? -ne 0 ] ; then
|
100
|
+
warn "Could not set maximum file descriptor limit: $MAX_FD"
|
101
|
+
fi
|
102
|
+
else
|
103
|
+
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
|
104
|
+
fi
|
105
|
+
fi
|
106
|
+
|
107
|
+
# For Darwin, add options to specify how the application appears in the dock
|
108
|
+
if $darwin; then
|
109
|
+
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
|
110
|
+
fi
|
111
|
+
|
112
|
+
# For Cygwin, switch paths to Windows format before running java
|
113
|
+
if $cygwin ; then
|
114
|
+
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
|
115
|
+
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
|
116
|
+
JAVACMD=`cygpath --unix "$JAVACMD"`
|
117
|
+
|
118
|
+
# We build the pattern for arguments to be converted via cygpath
|
119
|
+
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
|
120
|
+
SEP=""
|
121
|
+
for dir in $ROOTDIRSRAW ; do
|
122
|
+
ROOTDIRS="$ROOTDIRS$SEP$dir"
|
123
|
+
SEP="|"
|
124
|
+
done
|
125
|
+
OURCYGPATTERN="(^($ROOTDIRS))"
|
126
|
+
# Add a user-defined pattern to the cygpath arguments
|
127
|
+
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
|
128
|
+
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
|
129
|
+
fi
|
130
|
+
# Now convert the arguments - kludge to limit ourselves to /bin/sh
|
131
|
+
i=0
|
132
|
+
for arg in "$@" ; do
|
133
|
+
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
|
134
|
+
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
|
135
|
+
|
136
|
+
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
|
137
|
+
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
|
138
|
+
else
|
139
|
+
eval `echo args$i`="\"$arg\""
|
140
|
+
fi
|
141
|
+
i=$((i+1))
|
142
|
+
done
|
143
|
+
case $i in
|
144
|
+
(0) set -- ;;
|
145
|
+
(1) set -- "$args0" ;;
|
146
|
+
(2) set -- "$args0" "$args1" ;;
|
147
|
+
(3) set -- "$args0" "$args1" "$args2" ;;
|
148
|
+
(4) set -- "$args0" "$args1" "$args2" "$args3" ;;
|
149
|
+
(5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
|
150
|
+
(6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
|
151
|
+
(7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
|
152
|
+
(8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
|
153
|
+
(9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
|
154
|
+
esac
|
155
|
+
fi
|
156
|
+
|
157
|
+
# Escape application args
|
158
|
+
# Emit the given arguments as a shell-quoted list suitable for a later eval.
# Each argument is printed, then rewritten by sed: every embedded single quote
# becomes '\'' , a single quote is prepended to the first line, and "' \" is
# appended to the last line. A final " " is echoed after the last argument.
# The result is captured into APP_ARGS and re-parsed by the `eval set --` line
# that follows this function.
save () {
|
159
|
+
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
|
160
|
+
echo " "
|
161
|
+
}
|
162
|
+
APP_ARGS=$(save "$@")
|
163
|
+
|
164
|
+
# Collect all arguments for the java command, following the shell quoting and substitution rules
|
165
|
+
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
|
166
|
+
|
167
|
+
# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
|
168
|
+
if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
|
169
|
+
cd "$(dirname "$0")"
|
170
|
+
fi
|
171
|
+
|
172
|
+
exec "$JAVACMD" "$@"
|