embulk-filter-distinct 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 5c300a014b0ed10c8ef0c71f17d1f7acde8a0f6e
4
+ data.tar.gz: 25680ed19adfddd53bb7cadaf4d538c9838ea53e
5
+ SHA512:
6
+ metadata.gz: 5f2ea1a43abe07442ec07c415536da6aa0202c89f73d5d28cfb53f44daf2eb12dc36e1d66376078a0f19adc13bd15f52cf9487db34d4500a27dd28730cd9036f
7
+ data.tar.gz: 061ab68e1a192c0e8df94625061b0a0f5be974fc7fc555dafabd58795496c3faf03f6a0c4890346f3c9b9c3025a33fef0c9ab232568386745e5616a968672f7c
data/.gitignore ADDED
@@ -0,0 +1,14 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ *.gemspec
5
+ .gradle/
6
+ /classpath/
7
+ build/
8
+ .idea
9
+ /.settings/
10
+ /.metadata/
11
+ .classpath
12
+ .project
13
+ .idea
14
+ *.iml
data/CHANGELOG.md ADDED
@@ -0,0 +1,4 @@
1
+ 0.0.1 (2015-12-08)
2
+ ==================
3
+
4
+ - first version
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,42 @@
1
+ # Distinct filter plugin for Embulk
2
+
3
+ This filter returns only the records that are distinct with respect to the columns you configure.
4
+
5
+ ## Overview
6
+
7
+ * **Plugin type**: filter
8
+
9
+ ## Configuration
10
+
11
+ - **columns**: list of column names used to identify distinct records (array of strings, required)
12
+
13
+ ## Example
14
+
15
+ ```yaml
16
+ filters:
17
+ - type: distinct
18
+ columns: [c0, c1]
19
+ ```
20
+
21
+ ## Run Example
22
+
23
+ ```
24
+ $ ./gradlew classpath
25
+ $ embulk run -I lib example/config.yml
26
+ ```
27
+
28
+ ## Note
29
+
30
+ This plugin can use a lot of memory because it keeps every distinct combination of the configured column values in memory.
31
+
32
+ ## TODO
33
+
34
+ - Further reduce the amount of memory used by the filter, e.g. use the CRC32 of the values as the distinct key?
35
+ - want ideas!
36
+ - test
37
+
38
+ ## Build
39
+
40
+ ```
41
+ $ ./gradlew gem # -t to watch for file changes and rebuild continuously
42
+ ```
data/build.gradle ADDED
@@ -0,0 +1,86 @@
1
+ plugins {
2
+ id "com.jfrog.bintray" version "1.1"
3
+ id "com.github.jruby-gradle.base" version "0.1.5"
4
+ id "java"
5
+ // For test/coverage
6
+ id "com.github.kt3k.coveralls" version "2.4.0"
7
+ id "jacoco"
8
+ }
9
+ import com.github.jrubygradle.JRubyExec
10
+ repositories {
11
+ mavenCentral()
12
+ jcenter()
13
+ }
14
+ configurations {
15
+ provided
16
+ }
17
+
18
+ version = "0.0.1"
19
+ sourceCompatibility = 1.7
20
+ targetCompatibility = 1.7
21
+
22
+ dependencies {
23
+ compile "org.embulk:embulk-core:0.7.+"
24
+ provided "org.embulk:embulk-core:0.7.+"
25
+ // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
26
+ testCompile "junit:junit:4.+"
27
+ testCompile "org.embulk:embulk-core:0.7.+:tests"
28
+ }
29
+
30
+ jacocoTestReport {
31
+ reports {
32
+ xml.enabled = true // coveralls plugin depends on xml format report
33
+ html.enabled = true
34
+ }
35
+ }
36
+
37
+ task classpath(type: Copy, dependsOn: ["jar"]) {
38
+ doFirst { file("classpath").deleteDir() }
39
+ from (configurations.runtime - configurations.provided + files(jar.archivePath))
40
+ into "classpath"
41
+ }
42
+ clean { delete "classpath" }
43
+
44
+ task gem(type: JRubyExec, dependsOn: ["gemspec", "classpath"]) {
45
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
46
+ script "${project.name}.gemspec"
47
+ doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
48
+ }
49
+
50
+ task gemPush(type: JRubyExec, dependsOn: ["gem"]) {
51
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "push"
52
+ script "pkg/${project.name}-${project.version}.gem"
53
+ }
54
+
55
+ task "package"(dependsOn: ["gemspec", "classpath"]) << {
56
+ println "> Build succeeded."
57
+ println "> You can run embulk with '-L ${file(".").absolutePath}' argument."
58
+ }
59
+
60
+ task gemspec {
61
+ ext.gemspecFile = file("${project.name}.gemspec")
62
+ inputs.file "build.gradle"
63
+ outputs.file gemspecFile
64
+ doLast { gemspecFile.write($/
65
+ Gem::Specification.new do |spec|
66
+ spec.name = "${project.name}"
67
+ spec.version = "${project.version}"
68
+ spec.authors = ["Civitaspo"]
69
+ spec.summary = %[Distinct filter plugin for Embulk]
70
+ spec.description = %[Distinct]
71
+ spec.email = ["civitaspo@gmail.com"]
72
+ spec.licenses = ["MIT"]
73
+ spec.homepage = "https://github.com/civitaspo/embulk-filter-distinct"
74
+
75
+ spec.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
76
+ spec.test_files = spec.files.grep(%r"^(test|spec)/")
77
+ spec.require_paths = ["lib"]
78
+
79
+ #spec.add_dependency 'YOUR_GEM_DEPENDENCY', ['~> YOUR_GEM_DEPENDENCY_VERSION']
80
+ spec.add_development_dependency 'bundler', ['~> 1.0']
81
+ spec.add_development_dependency 'rake', ['>= 10.0']
82
+ end
83
+ /$)
84
+ }
85
+ }
86
+ clean { delete "${project.name}.gemspec" }
@@ -0,0 +1,27 @@
1
+ # in:
2
+ # type: random
3
+ # rows: 100
4
+ # schema:
5
+ # id: primary_key
6
+ # name: string
7
+ # score: integer
8
+ in:
9
+ type: file
10
+ path_prefix: example/data.csv
11
+ parser:
12
+ type: csv
13
+ charset: UTF-8
14
+ newline: CRLF
15
+ null_string: 'NULL'
16
+ skip_header_lines: 1
17
+ comment_line_marker: '#'
18
+ columns:
19
+ - {name: time, type: timestamp, format: "%Y-%m-%d"}
20
+ - {name: id, type: long}
21
+ - {name: name, type: string}
22
+ - {name: score, type: double}
23
+ filters:
24
+ - type: distinct
25
+ columns: [time]
26
+ out:
27
+ type: stdout
data/example/data.csv ADDED
@@ -0,0 +1,100 @@
1
+ time,id,name,score
2
+ 2015-07-13,0,Vqjht6YEUBsMPXmoW1iOGFROZF27pBzz0TUkOKeDXEY,1370
3
+ 2015-07-13,1,VmjbjAA0tOoSEPv_vKAGMtD_0aXZji0abGe7_VXHmUQ,3962
4
+ 2015-07-13,2,C40P5H1WcBx-aWFDJCI8th6QPEI2DOUgupt_gB8UutE,7323
5
+ 2015-07-13,3,Prr0_u_T1ts4myUofBorOJFpCYcOTLOmNBMuRmKIPJU,5905
6
+ 2015-07-13,4,AEGIhHVW5cV6Xlb62uvx3TVl3kmh3Do8AvvtLDS7MDw,8378
7
+ 2015-07-13,5,eupqWLrnCHr_1UaX4dUInLRxx5Q_cyQ4t0oSJBcw0MA,275
8
+ 2015-07-13,6,BN8cQ47EXRb_oCGOoN96bhBldoiyoCp5O_vGHwg0XCg,9303
9
+ 2015-07-13,7,RvV35-6jY6MC9_Wnm4nPsmyyfqcr-hlnBt88sXtn1nU,6130
10
+ 2015-07-13,8,6OZiuPiJKjWNLMPgiEbJarB0F80lTPYkkePP8LMliv0,6652
11
+ 2015-07-13,9,13CgEU_ApAMVE6Ll6Y-mSu-aubskNgHbynj2rj8f8oE,6822
12
+ 2015-07-13,10,j1evoWRzKrJR0sfo014ZxhZtKigWDkRip5FwpAHAsmU,1311
13
+ 2015-07-13,11,4vBBBcArfMGhediXV5Sn80hj4KkI4nUCllECNKxNgnI,4748
14
+ 2015-07-13,12,6LSLQGjv46TWsvXrxYCfM5yIz4JGiGd1eEQI4TC-4yc,43
15
+ 2015-07-13,13,bgLJeacIPOMH6sDb5tEmca1oYyaMdfqZomGEI2uby7k,1214
16
+ 2015-07-13,14,bRHc-42RqKVv3ORxhVCA4T4dLEXyBzBCQoed8VOrDCo,7048
17
+ 2015-07-13,15,ysiB3w-K5jb3FxpQY61OHYTlK9qklz3nW84RLvBnh9s,8795
18
+ 2015-07-13,16,Rvn7-tMbQM3q0yWQD8AUdURhFB0ZkzLGdIiDg-AJokM,7838
19
+ 2015-07-13,17,FDEI99QVJ8xRTOiQ-UDVlPMOBfuA0IwIAbJ872XnKOo,9507
20
+ 2015-07-13,18,lZUazYHDEGbQbzN7vEFeLjmnzp1wsjR0D8r8f7Cs6x0,3378
21
+ 2015-07-13,19,WmDFEQsDPSnVs8AiAdO3QJqlSFer1K0I8z7F0cl_WRk,1661
22
+ 2015-07-13,20,OEDSi7YIj4OjMNqTw12EA04BNtNuVWva6YRhokxL4xQ,5934
23
+ 2015-07-13,21,fXYhm19m2FsbWcRQGqJvVOSl2ZIRSNhWTfke-iG8e7Q,680
24
+ 2015-07-13,22,LK59zfxizCwr5CI2Wu88B8gY8-G4OeyAXZobplwGzKk,8758
25
+ 2015-07-13,23,8i5TVZorCp4YATsaxgybkdOHcmDywvb35Sf-Eb-sl9E,8392
26
+ 2015-07-13,24,MrM9vy1U-9_OEYOQAxbshenvvUGdCZfqjx7l3KKBQ2I,8708
27
+ 2015-07-13,25,miVWwEwur_7baTxIBHUT9y351AU3tnAcCXgBzvyUR5I,2843
28
+ 2015-07-13,26,_vxViqC02KVb7RRBeDGYs9VZ52KB8QmvguzSXUYGfwI,6681
29
+ 2015-07-13,27,Ui6BqkQDipo5kQEeVUuC2OFFIB1O4T8ALlM2GI_zvtk,7542
30
+ 2015-07-13,28,OT3VLH-RdK0sIgQM3f6LIbBa_rt0YzCD5YOw4qpu6p8,5791
31
+ 2015-07-13,29,vassmNeEo_jbn88g7QP58mTxH-b1jhHfwFhy-FL6T8c,9613
32
+ 2015-07-13,30,VjzTphngC6V5fphi9fkGeYGCPIQNpDajfkHxrJopF6k,3064
33
+ 2015-07-13,31,aqw27tMVvSsLJ8EEY3hphHMb0BRLm-LZysjVV3aX7pQ,7862
34
+ 2015-07-13,32,ZXepGbCv7Yw_ejNQyAPjrqG_VwNH_RZoG8lKODl-f9c,397
35
+ 2015-07-13,33,-yRoubVSa0oPfg0E1Gh7zYBQfBO8dIxZvQH9c5OsZAU,5003
36
+ 2015-07-13,34,UkhBEKU7G0rV58Urs6JTAgC0UF5Y2kP-dffmE6H4nGs,6514
37
+ 2015-07-13,35,ktLO3RTpHLZon7AhE9XMwPPh0t_GiOpS8vwCCqoPPnk,8634
38
+ 2015-07-13,36,3ktjc_W87j3S8qLOJ0CVEVSSpz_nUAEQVBsqOMabrp0,3679
39
+ 2015-07-13,37,KscV-oPqhG_CZXYUgdCmekKdR9FIT5tSt7rd3wpQDcU,1013
40
+ 2015-07-13,38,VFiC8YyBk6zZk5bpfZG8s1a3kYfMA1zvnbs6DDSplGY,1556
41
+ 2015-07-13,39,s0bxCQyW048GkhEAoEzXYGcTV8BZo6MLnRhL62nepYk,2844
42
+ 2015-07-13,40,aWbMyvSxxTqrVONKAeQVvqi_bGqROu9UeR5NqPPlI4A,8035
43
+ 2015-07-13,41,qfjEvEY8XSgMEmc-vIZLinOeIdIz6xprQbsYAe0i2WU,3205
44
+ 2015-07-13,42,NK2ddaghTrUTS6Y7U1e-l57922ccVOKnqlODcA6lyBQ,302
45
+ 2015-07-13,43,JRQpF1luRmNk2stUaZzDQDj93hy4RSW_iWybVgsgzJA,6534
46
+ 2015-07-13,44,lz7bs1xZi4qdWLE7fQwpykWDNgp_o9oUuCZXipSLSqw,9250
47
+ 2015-07-13,45,TxcwVGwelHKJws_6Q0Nk6I4Eeo9sSThM7M9KorqIGhA,5549
48
+ 2015-07-13,46,u_uy6k3TgUIp3NWMFJ8EOH1mKtFhozGBD208z9um88s,3624
49
+ 2015-07-13,47,RaI9xr82f0D7Jjuc4QY8Rz-UlCg3V5tw7KgJtczEo44,5278
50
+ 2015-07-13,48,u6Nqudxl6vrbKGemO8xXgYojhtBGK3SQkTRPSYcaZuI,9588
51
+ 2015-07-13,49,r-IgfD4fE9TiQWarsVxO_4AdieYIUZ9cczPD44_snQ4,4795
52
+ 2015-07-13,50,KIiUpd04d3zYDul1mFlcJ3934AYvA_YeXDYG089ub-M,4344
53
+ 2015-07-13,51,zZs0iuqm7liPKKHHn8wz-kNvd2zLCqRdXAng4B3gL0A,2116
54
+ 2015-07-13,52,Rg7T2IsH0-HIvhgq0mNRC-4q5JoZ5Rcjq4tP7dz_3Ew,5323
55
+ 2015-07-13,53,uBNgdXPL6kZGXP-gTic2N-uDRCxAtmI-KixkJWgrObA,9
56
+ 2015-07-13,54,fQ_TLG3oByt3sDqM3Kruo69fBd1qLMXbbg10myfFXkQ,2471
57
+ 2015-07-13,55,0uNd4TrRpEA1lY_zWikyELZ3MmCTzON_5ftfi-45wic,9831
58
+ 2015-07-13,56,Jfp4VCtsFElA6UzzZyPxOwegfGqsYwrimSFp59YshTs,3177
59
+ 2015-07-13,57,KAHSwcCwblbPRysuImbzUxx0SLAMIMb6LmMAXJBjUww,1182
60
+ 2015-07-13,58,wuyDbV5ljr5275eGWhAe8wkElCzd2d_gRW3SpBkLIyY,183
61
+ 2015-07-13,59,R3KTTvKRvPn6vu4qtooBbqYmwdOCC9vjmcsnf_fyu5g,5001
62
+ 2015-07-13,60,Pgsf32JIv2cUMdTE9Vydh2Y36B_Xi4T1ufIy7QiKFSU,6182
63
+ 2015-07-13,61,EZmz-tWhPPAsXsDZms_HHsDLKBOuZisUDotr72xXQnI,5228
64
+ 2015-07-13,62,mk4y32O73DU2z65dFuW1PvIokdB7bB7btUnCoDlSVxM,8094
65
+ 2015-07-13,63,fs1HvYjpOvAHnT5W1rCPU9A3k8_Px2XwfprrLrkQibM,5849
66
+ 2015-07-13,64,x8WAAde6AqG2YaOEIpCFMzItRrfUXqgc8bwcoWSiMEo,6076
67
+ 2015-07-13,65,zuvlwNyn8AgPEvg6qIxzkUp_ClPkMn5A__YyksWbxTo,6439
68
+ 2015-07-13,66,ZWWjbJAqVtZz3AzCpacgEabm7SMloLHPBTlS3NMk7GA,6531
69
+ 2015-07-13,67,wdHfAVpHp9rFaGhZOC81AusTsZX0KHxTf5RkFBw6gpI,8088
70
+ 2015-07-13,68,hw8HUkIQMSS-2gAT7rvA2kgdhXfhHlySxKtssINvcFc,7808
71
+ 2015-07-13,69,x1_SLENL6-M1y0n5qmfBF1-GCslEHpVM4Fo1Rdz9Ofg,3617
72
+ 2015-07-13,70,E2Uj3TGAwd_B3FOS6KQ1Gjyql2YpoNtbdzWBTUOWmxY,8401
73
+ 2015-07-13,71,WkpwSIP4fA42gYd1H3ohw7EmtqdQSqh4ooA7aX8v_7o,1309
74
+ 2015-07-13,72,xDdMCHpSKFSZWQBJJgNzNh1R4hXouCsUfKFZpio5cgY,7867
75
+ 2015-07-13,73,l0QVMlih2NmGSajDXytku9Em9p61erNKe1LEyk1VZ-Q,7964
76
+ 2015-07-13,74,R8G5juHaD9sit1oujjp4FoXzXJT7hdIjEY3Lhu-ep6o,5680
77
+ 2015-07-13,75,Ckpy3y166odB33VVWb27XNG_Wi65_qyikeL7dGHceSE,8603
78
+ 2015-07-13,76,elFu5tPgUNzhuyswgr3QS7TXR2fInI4PWVZIEffxq6c,4972
79
+ 2015-07-13,77,kz663CgkMh9VfcJrfMZb735vJJWYUPAuaskNeg7xRDk,8396
80
+ 2015-07-13,78,evuBVl0RR1XQfJHN4jxSBpLcKxjZ7RtpDGYrU2ONYZA,6433
81
+ 2015-07-13,79,ZbIJwmWRWscOurtrCam-iLB2mIqREwQwGFRfVYzGxwk,2917
82
+ 2015-07-13,80,mzCWiiJFzo1R_anxGFALosK0eKvGfv_RT7iRGZnL790,3162
83
+ 2015-07-13,81,JyrXoXLq5RpRwwXNpiW1NFK6ZkVmS55hJsNBGsuY7xY,2385
84
+ 2015-07-13,82,fO7A_MQGh3Zojp6HlVZayvJHLu_RQ082ix3Y6BlRCu0,5965
85
+ 2015-07-13,83,ib-pOMBLU1sN5fyyJbAElIdWEJgkoqRcBuwo6CVVYsk,3265
86
+ 2015-07-13,84,X_6Ren6P7TpqyiWViO72kEwIulMqbTU_v8eAGfEo8k0,8049
87
+ 2015-07-13,85,hNI30i9IYx7EreMyG7rI56Y-ZtrRe4sBYjzKMnSrL5I,9222
88
+ 2015-07-13,86,kzokOacUOXELAeIHfPbnl-Er8rnHYq2JnksqN1roOSQ,2972
89
+ 2015-07-13,87,qKIfkhQObWMadIi5vshcDRv95je4TYcAPSYITfwVTRk,5390
90
+ 2015-07-13,88,9xKf3bfWj8Gr1NNocYHZuL0kIkAVD750LCMYDZ-R1tA,4759
91
+ 2015-07-13,89,ohbmpvNy7aaaIVZ74SlHSfm0ffdwV-AqJP1bfDSjNUU,2279
92
+ 2015-07-13,90,l6lTsvxdlcTfcqx2c0lQSd9HejVQg40W25f0wGNQViY,9034
93
+ 2015-07-13,91,XoALSEQg9ycuGqrEWHOb8vdrLbheZSgFO53Wr3mciXY,3945
94
+ 2015-07-13,92,0hgDRI_mijs5w7rkiLIe__LEayOOLxL0qVT1IHa5QBw,8109
95
+ 2015-07-13,93,KjCRAc-AVcS-R13toBUR6pK_7d9Y8Gl4TRdYYMaSirc,4774
96
+ 2015-07-13,94,fyQVGlT8Bqmu_LiajPlgfbmavoNyAqXaBsBP_e4OnN8,7253
97
+ 2015-07-13,95,FpBYRPWKu6DmLpx5tsB25URWfj3sNCbcydNAXULaiD8,3166
98
+ 2015-07-13,96,9ikvnUqp1Rf2yVwLvs5bBvxQP-KyqxGi4gZRSZ8c1d4,3695
99
+ 2015-07-13,97,RRNYDAzKaq4Trtt96Bxgk3N0fXLIV8hXoK0qQ7uw_Wc,5065
100
+ ,,,9170
Binary file
@@ -0,0 +1,6 @@
1
+ #Tue Aug 11 00:26:20 PDT 2015
2
+ distributionBase=GRADLE_USER_HOME
3
+ distributionPath=wrapper/dists
4
+ zipStoreBase=GRADLE_USER_HOME
5
+ zipStorePath=wrapper/dists
6
+ distributionUrl=https\://services.gradle.org/distributions/gradle-2.6-bin.zip
data/gradlew ADDED
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/env bash
2
+
3
+ ##############################################################################
4
+ ##
5
+ ## Gradle start up script for UN*X
6
+ ##
7
+ ##############################################################################
8
+
9
+ # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
10
+ DEFAULT_JVM_OPTS=""
11
+
12
+ APP_NAME="Gradle"
13
+ APP_BASE_NAME=`basename "$0"`
14
+
15
+ # Use the maximum available, or set MAX_FD != -1 to use that value.
16
+ MAX_FD="maximum"
17
+
18
+ warn ( ) {
19
+ echo "$*"
20
+ }
21
+
22
+ die ( ) {
23
+ echo
24
+ echo "$*"
25
+ echo
26
+ exit 1
27
+ }
28
+
29
+ # OS specific support (must be 'true' or 'false').
30
+ cygwin=false
31
+ msys=false
32
+ darwin=false
33
+ case "`uname`" in
34
+ CYGWIN* )
35
+ cygwin=true
36
+ ;;
37
+ Darwin* )
38
+ darwin=true
39
+ ;;
40
+ MINGW* )
41
+ msys=true
42
+ ;;
43
+ esac
44
+
45
+ # For Cygwin, ensure paths are in UNIX format before anything is touched.
46
+ if $cygwin ; then
47
+ [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
48
+ fi
49
+
50
+ # Attempt to set APP_HOME
51
+ # Resolve links: $0 may be a link
52
+ PRG="$0"
53
+ # Need this for relative symlinks.
54
+ while [ -h "$PRG" ] ; do
55
+ ls=`ls -ld "$PRG"`
56
+ link=`expr "$ls" : '.*-> \(.*\)$'`
57
+ if expr "$link" : '/.*' > /dev/null; then
58
+ PRG="$link"
59
+ else
60
+ PRG=`dirname "$PRG"`"/$link"
61
+ fi
62
+ done
63
+ SAVED="`pwd`"
64
+ cd "`dirname \"$PRG\"`/" >&-
65
+ APP_HOME="`pwd -P`"
66
+ cd "$SAVED" >&-
67
+
68
+ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
69
+
70
+ # Determine the Java command to use to start the JVM.
71
+ if [ -n "$JAVA_HOME" ] ; then
72
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
73
+ # IBM's JDK on AIX uses strange locations for the executables
74
+ JAVACMD="$JAVA_HOME/jre/sh/java"
75
+ else
76
+ JAVACMD="$JAVA_HOME/bin/java"
77
+ fi
78
+ if [ ! -x "$JAVACMD" ] ; then
79
+ die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
80
+
81
+ Please set the JAVA_HOME variable in your environment to match the
82
+ location of your Java installation."
83
+ fi
84
+ else
85
+ JAVACMD="java"
86
+ which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
87
+
88
+ Please set the JAVA_HOME variable in your environment to match the
89
+ location of your Java installation."
90
+ fi
91
+
92
+ # Increase the maximum file descriptors if we can.
93
+ if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
94
+ MAX_FD_LIMIT=`ulimit -H -n`
95
+ if [ $? -eq 0 ] ; then
96
+ if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
97
+ MAX_FD="$MAX_FD_LIMIT"
98
+ fi
99
+ ulimit -n $MAX_FD
100
+ if [ $? -ne 0 ] ; then
101
+ warn "Could not set maximum file descriptor limit: $MAX_FD"
102
+ fi
103
+ else
104
+ warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
105
+ fi
106
+ fi
107
+
108
+ # For Darwin, add options to specify how the application appears in the dock
109
+ if $darwin; then
110
+ GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
111
+ fi
112
+
113
+ # For Cygwin, switch paths to Windows format before running java
114
+ if $cygwin ; then
115
+ APP_HOME=`cygpath --path --mixed "$APP_HOME"`
116
+ CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
117
+
118
+ # We build the pattern for arguments to be converted via cygpath
119
+ ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120
+ SEP=""
121
+ for dir in $ROOTDIRSRAW ; do
122
+ ROOTDIRS="$ROOTDIRS$SEP$dir"
123
+ SEP="|"
124
+ done
125
+ OURCYGPATTERN="(^($ROOTDIRS))"
126
+ # Add a user-defined pattern to the cygpath arguments
127
+ if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128
+ OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129
+ fi
130
+ # Now convert the arguments - kludge to limit ourselves to /bin/sh
131
+ i=0
132
+ for arg in "$@" ; do
133
+ CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134
+ CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135
+
136
+ if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137
+ eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138
+ else
139
+ eval `echo args$i`="\"$arg\""
140
+ fi
141
+ i=$((i+1))
142
+ done
143
+ case $i in
144
+ (0) set -- ;;
145
+ (1) set -- "$args0" ;;
146
+ (2) set -- "$args0" "$args1" ;;
147
+ (3) set -- "$args0" "$args1" "$args2" ;;
148
+ (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149
+ (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150
+ (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151
+ (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152
+ (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153
+ (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154
+ esac
155
+ fi
156
+
157
+ # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
158
+ function splitJvmOpts() {
159
+ JVM_OPTS=("$@")
160
+ }
161
+ eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
162
+ JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
163
+
164
+ exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
data/gradlew.bat ADDED
@@ -0,0 +1,90 @@
1
+ @if "%DEBUG%" == "" @echo off
2
+ @rem ##########################################################################
3
+ @rem
4
+ @rem Gradle startup script for Windows
5
+ @rem
6
+ @rem ##########################################################################
7
+
8
+ @rem Set local scope for the variables with windows NT shell
9
+ if "%OS%"=="Windows_NT" setlocal
10
+
11
+ @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
12
+ set DEFAULT_JVM_OPTS=
13
+
14
+ set DIRNAME=%~dp0
15
+ if "%DIRNAME%" == "" set DIRNAME=.
16
+ set APP_BASE_NAME=%~n0
17
+ set APP_HOME=%DIRNAME%
18
+
19
+ @rem Find java.exe
20
+ if defined JAVA_HOME goto findJavaFromJavaHome
21
+
22
+ set JAVA_EXE=java.exe
23
+ %JAVA_EXE% -version >NUL 2>&1
24
+ if "%ERRORLEVEL%" == "0" goto init
25
+
26
+ echo.
27
+ echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28
+ echo.
29
+ echo Please set the JAVA_HOME variable in your environment to match the
30
+ echo location of your Java installation.
31
+
32
+ goto fail
33
+
34
+ :findJavaFromJavaHome
35
+ set JAVA_HOME=%JAVA_HOME:"=%
36
+ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37
+
38
+ if exist "%JAVA_EXE%" goto init
39
+
40
+ echo.
41
+ echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42
+ echo.
43
+ echo Please set the JAVA_HOME variable in your environment to match the
44
+ echo location of your Java installation.
45
+
46
+ goto fail
47
+
48
+ :init
49
+ @rem Get command-line arguments, handling Windowz variants
50
+
51
+ if not "%OS%" == "Windows_NT" goto win9xME_args
52
+ if "%@eval[2+2]" == "4" goto 4NT_args
53
+
54
+ :win9xME_args
55
+ @rem Slurp the command line arguments.
56
+ set CMD_LINE_ARGS=
57
+ set _SKIP=2
58
+
59
+ :win9xME_args_slurp
60
+ if "x%~1" == "x" goto execute
61
+
62
+ set CMD_LINE_ARGS=%*
63
+ goto execute
64
+
65
+ :4NT_args
66
+ @rem Get arguments from the 4NT Shell from JP Software
67
+ set CMD_LINE_ARGS=%$
68
+
69
+ :execute
70
+ @rem Setup the command line
71
+
72
+ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
73
+
74
+ @rem Execute Gradle
75
+ "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
76
+
77
+ :end
78
+ @rem End local scope for the variables with windows NT shell
79
+ if "%ERRORLEVEL%"=="0" goto mainEnd
80
+
81
+ :fail
82
+ rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
83
+ rem the _cmd.exe /c_ return code!
84
+ if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
85
+ exit /b 1
86
+
87
+ :mainEnd
88
+ if "%OS%"=="Windows_NT" endlocal
89
+
90
+ :omega
@@ -0,0 +1,3 @@
1
+ Embulk::JavaPlugin.register_filter(
2
+ "distinct", "org.embulk.filter.distinct.DistinctFilterPlugin",
3
+ File.expand_path('../../../../classpath', __FILE__))
@@ -0,0 +1,69 @@
1
+ package org.embulk.filter.distinct;
2
+
3
+ import org.embulk.spi.Column;
4
+ import org.embulk.spi.ColumnVisitor;
5
+ import org.embulk.spi.Exec;
6
+ import org.embulk.spi.PageBuilder;
7
+ import org.embulk.spi.PageReader;
8
+ import org.slf4j.Logger;
9
+
10
+ /**
11
+ * Created by takahiro.nakayama on 12/6/15.
12
+ */
13
+ class ColumnVisitorImpl
14
+ implements ColumnVisitor
15
+ {
16
+ private final static Logger logger = Exec.getLogger(ColumnVisitorImpl.class);
17
+ private final PageReader pageReader;
18
+ private final PageBuilder pageBuilder;
19
+
20
+ ColumnVisitorImpl(PageReader pageReader, PageBuilder pageBuilder) {
21
+ this.pageReader = pageReader;
22
+ this.pageBuilder = pageBuilder;
23
+ }
24
+
25
+ @Override
26
+ public void booleanColumn(Column outputColumn) {
27
+ if (pageReader.isNull(outputColumn)) {
28
+ pageBuilder.setNull(outputColumn);
29
+ } else {
30
+ pageBuilder.setBoolean(outputColumn, pageReader.getBoolean(outputColumn));
31
+ }
32
+ }
33
+
34
+ @Override
35
+ public void longColumn(Column outputColumn) {
36
+ if (pageReader.isNull(outputColumn)) {
37
+ pageBuilder.setNull(outputColumn);
38
+ } else {
39
+ pageBuilder.setLong(outputColumn, pageReader.getLong(outputColumn));
40
+ }
41
+ }
42
+
43
+ @Override
44
+ public void doubleColumn(Column outputColumn) {
45
+ if (pageReader.isNull(outputColumn)) {
46
+ pageBuilder.setNull(outputColumn);
47
+ } else {
48
+ pageBuilder.setDouble(outputColumn, pageReader.getDouble(outputColumn));
49
+ }
50
+ }
51
+
52
+ @Override
53
+ public void stringColumn(Column outputColumn) {
54
+ if (pageReader.isNull(outputColumn)) {
55
+ pageBuilder.setNull(outputColumn);
56
+ } else {
57
+ pageBuilder.setString(outputColumn, pageReader.getString(outputColumn));
58
+ }
59
+ }
60
+
61
+ @Override
62
+ public void timestampColumn(Column outputColumn) {
63
+ if (pageReader.isNull(outputColumn)) {
64
+ pageBuilder.setNull(outputColumn);
65
+ } else {
66
+ pageBuilder.setTimestamp(outputColumn, pageReader.getTimestamp(outputColumn));
67
+ }
68
+ }
69
+ }
@@ -0,0 +1,79 @@
1
+ package org.embulk.filter.distinct;
2
+
3
+ import com.google.common.collect.ImmutableList;
4
+ import org.embulk.config.Config;
5
+ import org.embulk.config.ConfigDefault;
6
+ import org.embulk.config.ConfigException;
7
+ import org.embulk.config.ConfigInject;
8
+ import org.embulk.config.ConfigSource;
9
+ import org.embulk.config.Task;
10
+ import org.embulk.config.TaskSource;
11
+ import org.embulk.spi.Column;
12
+ import org.embulk.spi.Exec;
13
+ import org.embulk.spi.FilterPlugin;
14
+ import org.embulk.spi.PageOutput;
15
+ import org.embulk.spi.Schema;
16
+ import org.slf4j.Logger;
17
+
18
+ import java.util.List;
19
+
20
+ public class DistinctFilterPlugin
21
+ implements FilterPlugin
22
+ {
23
+ private final static Logger logger = Exec.getLogger(DistinctFilterPlugin.class);
24
+
25
+ public interface PluginTask
26
+ extends Task
27
+ {
28
+ @Config("columns")
29
+ public List<String> getDistinctColumnNames();
30
+
31
+ @ConfigInject
32
+ public void setDistinctColumns(List<Column> columns);
33
+ public List<Column> getDistinctColumns();
34
+ }
35
+
36
+ @Override
37
+ public void transaction(ConfigSource config, Schema inputSchema,
38
+ FilterPlugin.Control control)
39
+ {
40
+ PluginTask task = config.loadConfig(PluginTask.class);
41
+
42
+ List<Column> distinctColumns = convertNameToColumn(inputSchema, task.getDistinctColumnNames());
43
+ task.setDistinctColumns(distinctColumns);
44
+
45
+ if (task.getDistinctColumns().isEmpty()) {
46
+ throw new ConfigException(
47
+ "inputSchema does not have any columns you configured.");
48
+ }
49
+ else {
50
+ logger.debug("distinct columns: {}", task.getDistinctColumns());
51
+ }
52
+
53
+ Schema outputSchema = inputSchema;
54
+ control.run(task.dump(), outputSchema);
55
+ }
56
+
57
+ @Override
58
+ public PageOutput open(TaskSource taskSource, final Schema inputSchema,
59
+ final Schema outputSchema, final PageOutput output)
60
+ {
61
+ final PluginTask task = taskSource.loadTask(PluginTask.class);
62
+ return new FilteredPageOutput(task, inputSchema,
63
+ outputSchema, output);
64
+ }
65
+
66
+ private List<Column> convertNameToColumn(Schema inputSchema, List<String> columnNames)
67
+ {
68
+ ImmutableList.Builder<Column> builder = ImmutableList.builder();
69
+ for (String columnName : columnNames) {
70
+ for (Column column : inputSchema.getColumns()) {
71
+ if (columnName.contentEquals(column.getName())) {
72
+ builder.add(column);
73
+ }
74
+ }
75
+ }
76
+
77
+ return builder.build();
78
+ }
79
+ }
@@ -0,0 +1,100 @@
1
+ package org.embulk.filter.distinct;
2
+
3
+ import com.google.common.base.Strings;
4
+ import com.google.common.collect.ImmutableList;
5
+ import com.google.common.collect.ObjectArrays;
6
+ import com.google.common.collect.Sets;
7
+ import org.embulk.filter.distinct.DistinctFilterPlugin.PluginTask;
8
+ import org.embulk.spi.Column;
9
+ import org.embulk.spi.Exec;
10
+ import org.embulk.spi.Page;
11
+ import org.embulk.spi.PageBuilder;
12
+ import org.embulk.spi.PageOutput;
13
+ import org.embulk.spi.PageReader;
14
+ import org.embulk.spi.Schema;
15
+ import org.embulk.spi.type.Types;
16
+ import org.slf4j.Logger;
17
+
18
+ import java.util.List;
19
+ import java.util.Set;
20
+
21
+ /**
22
+ * Created by takahiro.nakayama on 12/6/15.
23
+ */
24
+ class FilteredPageOutput
25
+ implements PageOutput
26
+ {
27
+ private final static Logger logger = Exec.getLogger(FilteredPageOutput.class);
28
+ private final PageReader pageReader;
29
+ private final PageBuilder pageBuilder;
30
+ private final ColumnVisitorImpl visitor;
31
+ private final Schema outputSchema;
32
+ private final List<Column> distinctColumns;
33
+
34
+ private final static Set<List<Object>> filter = Sets.newConcurrentHashSet();
35
+
36
+ FilteredPageOutput(PluginTask task, Schema inputSchema,
37
+ Schema outputSchema, PageOutput pageOutput)
38
+ {
39
+ this.pageReader = new PageReader(inputSchema);
40
+ this.pageBuilder = new PageBuilder(Exec.getBufferAllocator(), outputSchema, pageOutput);
41
+ this.visitor = new ColumnVisitorImpl(pageReader, pageBuilder);
42
+ this.outputSchema = outputSchema;
43
+ this.distinctColumns = task.getDistinctColumns();
44
+ }
45
+
46
+ @Override
47
+ public void add(Page page)
48
+ {
49
+ pageReader.setPage(page);
50
+
51
+ while (pageReader.nextRecord()) {
52
+ if (filter.add(getCurrentDistinctKey())) {
53
+ outputSchema.visitColumns(visitor);
54
+ pageBuilder.addRecord();
55
+ }
56
+ }
57
+ }
58
+
59
+ @Override
60
+ public void finish()
61
+ {
62
+ pageBuilder.finish();
63
+ }
64
+
65
+ @Override
66
+ public void close()
67
+ {
68
+ pageReader.close();
69
+ pageBuilder.close();
70
+ }
71
+
72
+ private List<Object> getCurrentDistinctKey()
73
+ {
74
+ ImmutableList.Builder<Object> builder = ImmutableList.builder();
75
+ for (Column distinctColumn : distinctColumns) {
76
+ if (!pageReader.isNull(distinctColumn)) {
77
+ if (Types.BOOLEAN.equals(distinctColumn.getType())) {
78
+ builder.add(pageReader.getBoolean(distinctColumn));
79
+ }
80
+ else if (Types.DOUBLE.equals(distinctColumn.getType())) {
81
+ builder.add(pageReader.getDouble(distinctColumn));
82
+ }
83
+ else if (Types.LONG.equals(distinctColumn.getType())) {
84
+ builder.add(pageReader.getLong(distinctColumn));
85
+ }
86
+ else if (Types.STRING.equals(distinctColumn.getType())) {
87
+ builder.add(pageReader.getString(distinctColumn));
88
+ }
89
+ else if (Types.TIMESTAMP.equals(distinctColumn.getType())) {
90
+ builder.add(pageReader.getTimestamp(distinctColumn));
91
+ }
92
+ else {
93
+ throw new RuntimeException("unsupported type: " + distinctColumn.getType());
94
+ }
95
+ }
96
+ }
97
+
98
+ return builder.build();
99
+ }
100
+ }
@@ -0,0 +1,5 @@
1
+ package org.embulk.filter.distinct;
2
+
3
+ public class TestDistinctFilterPlugin
4
+ {
5
+ }
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-filter-distinct
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Civitaspo
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-12-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ version_requirements: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ requirement: !ruby/object:Gem::Requirement
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: '1.0'
25
+ prerelease: false
26
+ type: :development
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ requirement: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '10.0'
39
+ prerelease: false
40
+ type: :development
41
+ description: Distinct
42
+ email:
43
+ - civitaspo@gmail.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - .gitignore
49
+ - CHANGELOG.md
50
+ - LICENSE.txt
51
+ - README.md
52
+ - build.gradle
53
+ - example/config.yml
54
+ - example/data.csv
55
+ - gradle/wrapper/gradle-wrapper.jar
56
+ - gradle/wrapper/gradle-wrapper.properties
57
+ - gradlew
58
+ - gradlew.bat
59
+ - lib/embulk/filter/distinct.rb
60
+ - src/main/java/org/embulk/filter/distinct/ColumnVisitorImpl.java
61
+ - src/main/java/org/embulk/filter/distinct/DistinctFilterPlugin.java
62
+ - src/main/java/org/embulk/filter/distinct/FilteredPageOutput.java
63
+ - src/test/java/org/embulk/filter/distinct/TestDistinctFilterPlugin.java
64
+ - classpath/embulk-filter-distinct-0.0.1.jar
65
+ homepage: https://github.com/civitaspo/embulk-filter-distinct
66
+ licenses:
67
+ - MIT
68
+ metadata: {}
69
+ post_install_message:
70
+ rdoc_options: []
71
+ require_paths:
72
+ - lib
73
+ required_ruby_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ requirements: []
84
+ rubyforge_project:
85
+ rubygems_version: 2.1.9
86
+ signing_key:
87
+ specification_version: 4
88
+ summary: Distinct filter plugin for Embulk
89
+ test_files: []