orefine 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2 @@
1
+ Gemfile.lock
2
+ pkg/
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Michael Bianco
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,91 @@
1
#!/usr/bin/env ruby

# Command-line entry point for orefine.
#
# Loads one or two CSV files into OpenRefine projects named csv_a / csv_b,
# applies the clean-up and merge steps selected by the options, then
# optionally prints the resulting CSV and/or opens the project in a browser.

require 'orefine'
require 'slop'

# Makes CSVUtil reachable without the Orefine:: prefix.
include Orefine

$opts = Slop.parse do
  banner 'Usage: orefine csv_a [csv_b] [options]'

  on 'output-columns=', 'List of columns you want in the resulting csv', as: Array
  on 'delete-columns=', 'What columns to delete from the output', as: Array
  on 'merge=', 'What column to merge in from csv_b', as: Array
  on 'add-static-column=', 'Add a column with a static value (input: key, value)', as: Array
  # Declared as an Array because the value is iterated with #each below.
  on 'merge-common=', 'Merge values from common columns with a comma', as: Array

  on 'diff', 'only output rows in csv_a whose email does not exist in csv_b'
  on 'common', 'only output rows common to both csvs'
  on 'split-full-name', 'split a full name field into first and last'

  on 'open', 'open the document in a web browser'
  on 'stdout', 'write the resulting csv to stdout'
end

if ARGV.empty?
  puts $opts.help
  exit
end

csv_a_path = ARGV[0]
csv_b_path = ARGV[1]

# TODO clear out all old CSV a & b or timestamp the new ones

CSVUtil.clear_all_csvs
csv_a = Refine.new("project_name" => 'csv_a', "file_name" => csv_a_path)

# ARGV[1] is only treated as csv_b when it names an existing file; when a
# single CSV is given it may be an option flag instead.
csv_b = nil
if !csv_b_path.nil? && File.exist?(csv_b_path)
  csv_b = Refine.new("project_name" => 'csv_b', "file_name" => csv_b_path)
end

all_csvs = [csv_a]
all_csvs << csv_b unless csv_b.nil?

CSVUtil.normalize_column_names(all_csvs)
CSVUtil.normalize_email_column_content(all_csvs)
CSVUtil.create_common_flag(csv_a, csv_b) unless csv_b.nil?

CSVUtil.split_full_name(csv_a) unless $opts['split-full-name'].nil?

unless $opts['merge'].nil?
  # merge_field reads csv_b.project_name, so a second project is mandatory.
  abort "--merge requires a second CSV (csv_b) to merge from" if csv_b.nil?

  $opts['merge'].each do |merge_field|
    CSVUtil.merge_field(csv_a, csv_b, merge_field)
  end
end

unless $opts['merge-common'].nil?
  $opts['merge-common'].each do |common_merge|
    CSVUtil.merge_common_field(csv_a, csv_b, common_merge)
  end
end

unless $opts['add-static-column'].nil?
  # Input is "key,value": the first element is the column name, the last its value.
  CSVUtil.add_column(csv_a, $opts['add-static-column'].first, $opts['add-static-column'].last)
end

output_params = { "format" => "csv" }

unless $opts['output-columns'].nil?
  output_params["options"] ||= {}
  output_params["options"]["columns"] ||= []

  $opts['output-columns'].each do |c|
    output_params["options"]["columns"] << { "name" => c }
  end
end

unless $opts['delete-columns'].nil?
  $opts['delete-columns'].each do |column_name|
    CSVUtil.delete_column(csv_a, column_name)
  end
end

if !csv_b.nil? && ($opts.diff? || $opts.common?)
  # --diff keeps rows whose "exists" flag is false; --common keeps the true ones.
  # When both are given, --diff wins.
  flag = !$opts.diff?

  output_params["facets"] = [ CSVUtil.common_facet(flag) ]
end

puts csv_a.export_rows(output_params) if $opts.stdout?

# Argument-list form of system avoids passing the URL through a shell.
system('open', "http://127.0.0.1:3333/project?project=#{csv_a.project_id}") if $opts.open?
@@ -0,0 +1,234 @@
1
+ require "orefine/version"
2
+ require 'google-refine'
3
+
4
require 'json'

module Orefine
  # Class-level helpers that clean up and combine OpenRefine projects created
  # from CSV files. Every helper builds a JSON list of OpenRefine operations
  # and hands it to each project's `apply_operations` method.
  class CSVUtil
    class << self
      # Deletes every OpenRefine project whose name starts with 'csv_'.
      #
      # @return [Array] the results of each `delete_project` call
      def clear_all_csvs
        Refine.get_all_project_metadata["projects"]
          .select { |_project_id, metadata| metadata["name"].start_with?('csv_') }
          .keys
          .map { |project_id| Refine.new("project_id" => project_id) }
          .map(&:delete_project)
      end

      # Renames the known spellings of the email, zip and full-name columns
      # to their canonical names.
      #
      # @param projects [Object, Array] one project or a list of projects
      def normalize_column_names(projects)
        normalize_email_column_name(projects)
        normalize_zip_column_name(projects)
        normalize_full_name_column_name(projects)
      end

      # Renames known email column spellings to "email".
      #
      # @param projects [Object, Array] one project or a list of projects
      def normalize_email_column_name(projects)
        rename_columns(projects, ["E-mail", "Email Address", "Email", "[email]", "email_stripped"], "email")
      end

      # Renames known zip column spellings to "zip".
      #
      # @param projects [Object, Array] one project or a list of projects
      def normalize_zip_column_name(projects)
        rename_columns(projects, ["Zip", "[zip]"], "zip")
      end

      # Renames known full-name column spellings to "full_name".
      #
      # @param projects [Object, Array] one project or a list of projects
      def normalize_full_name_column_name(projects)
        rename_columns(projects, ["Name", "Full Name"], "full_name")
      end

      # Adds an "email_stripped" column holding the lower-cased, stripped
      # value of the "email" column.
      #
      # @param projects [Object, Array] one project or a list of projects
      def normalize_email_column_content(projects)
        operation = column_addition("email_stripped", "email", "grel:strip(value.toLowercase())")
        perform_operation(projects, JSON.generate([operation]))
      end

      # Splits the "full_name" column into at most two columns, keeping the
      # original column in place.
      #
      # @param projects [Object, Array] one project or a list of projects
      def split_full_name(projects)
        operation = {
          "op" => "core/column-split",
          "description" => "Split column Name by separator",
          "engineConfig" => { "facets" => [], "mode" => "row-based" },
          "columnName" => "full_name",
          "guessCellType" => false,
          "removeOriginalColumn" => false,
          "mode" => "separator",
          # Split on a space that follows a lowercase letter.
          "separator" => "(?<=[a-z]) ",
          "regex" => true,
          "maxColumns" => 2
        }
        perform_operation(projects, JSON.generate([operation]))
      end

      # Adds an "exists" column to project_a that flags rows whose
      # email_stripped value also appears in project_b. An existing "exists"
      # column is deleted first (with a notice on STDERR).
      #
      # @param project_a [Object] project that receives the flag column
      # @param project_b [Object] project looked up by its `project_name`
      def create_common_flag(project_a, project_b)
        column_names = project_a.get_columns_info.map { |column| column["name"] }
        if column_names.include?('exists')
          STDERR.puts "'exists' column already exists in csv_a, deleting"
          delete_column(project_a, "exists")
        end

        expression = "grel:cell.cross(#{grel_string(project_b.project_name)}, \"email_stripped\").cells.length() > 0"
        perform_operation(project_a, JSON.generate([column_addition("exists", "email_stripped", expression)]))
      end

      # Adds a "<field>_merged" column to project_a holding the value of
      # `field` from the first project_b row with the same email_stripped.
      #
      # @param project_a [Object] project that receives the merged column
      # @param project_b [Object] project the value is read from
      # @param field [String] column name in project_b
      def merge_field(project_a, project_b, field)
        expression = "grel:cell.cross(#{grel_string(project_b.project_name)}, \"email_stripped\")" \
                     ".cells[#{grel_string(field)}].value[0]"
        operation = column_addition("#{field}_merged", "email_stripped", expression)
        perform_operation(project_a, JSON.generate([operation]))
      end

      # Placeholder: merging values of a column common to both CSVs is not
      # implemented. Emits a notice on STDERR instead of silently doing
      # nothing.
      #
      # @return [nil]
      def merge_common_field(csv_a, csv_b, common_field)
        # TODO implement merging of common column values
        STDERR.puts "merge_common_field is not implemented yet, ignoring '#{common_field}'"
        nil
      end

      # Removes the column named `field` from the given project(s).
      #
      # @param csv [Object, Array] one project or a list of projects
      # @param field [String] column to remove
      def delete_column(csv, field)
        operation = { "op" => "core/column-removal", "columnName" => field.to_s }
        perform_operation(csv, JSON.generate([operation]))
      end

      # Adds a column named `field` whose every cell is the static `value`.
      #
      # @param csv [Object, Array] one project or a list of projects
      # @param field [String] name of the new column
      # @param value [String] constant cell value
      def add_column(csv, field, value)
        operation = column_addition(field.to_s, "email_stripped", "grel:#{grel_string(value)}")
        perform_operation(csv, JSON.generate([operation]))
      end

      # Builds a list facet on the "exists" column selecting rows whose flag
      # equals `flag`.
      #
      # @param flag [Boolean] value of the "exists" column to select
      # @return [Hash] facet definition
      def common_facet(flag = true)
        # string vs boolean matters here... be careful
        selected_value = { "v" => flag, "l" => flag }

        {
          "invert" => false,
          "expression" => "value",
          "selectError" => false,
          "omitError" => false,
          "selectBlank" => false,
          "name" => "exists",
          "omitBlank" => false,
          "columnName" => "exists",
          "type" => "list",
          "selection" => [{ "v" => selected_value }]
        }
      end

      # Applies an operation list to one project or to each of several.
      #
      # @param projects [Object, Array] one project or a list of projects
      # @param operation [String] JSON text of the operation list
      # @return [Array] the list of projects the operation was applied to
      def perform_operation(projects, operation)
        targets = projects.is_a?(Array) ? projects : [projects]
        targets.each { |project| project.apply_operations(operation) }
      end

      private

      # Sends one column-rename operation per old name, in the given order.
      def rename_columns(projects, old_names, new_name)
        operations = old_names.map do |old_name|
          {
            "op" => "core/column-rename",
            "oldColumnName" => old_name,
            "newColumnName" => new_name
          }
        end
        perform_operation(projects, JSON.generate(operations))
      end

      # Builds a record-based column-addition operation that inserts the new
      # column at index 0 and blanks cells whose expression fails.
      def column_addition(new_column_name, base_column_name, expression)
        {
          "op" => "core/column-addition",
          "engineConfig" => { "facets" => [], "mode" => "record-based" },
          "newColumnName" => new_column_name,
          "columnInsertIndex" => 0,
          "baseColumnName" => base_column_name,
          "expression" => expression,
          "onError" => "set-to-blank"
        }
      end

      # Wraps text in double quotes for use inside a GREL expression,
      # backslash-escaping embedded quotes and backslashes.
      # NOTE(review): assumes GREL string literals honour backslash escapes.
      def grel_string(text)
        '"' + text.to_s.gsub(/["\\]/) { |char| "\\#{char}" } + '"'
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
# Top-level namespace of the orefine gem.
module Orefine
  # Release number of the gem.
  VERSION = '0.0.1'
end
@@ -0,0 +1,26 @@
1
# coding: utf-8

# Gem specification for orefine.
# Puts ./lib on the load path first so the version constant can be required.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'orefine/version'

Gem::Specification.new do |spec|
  # Identity
  spec.name        = "orefine"
  spec.version     = Orefine::VERSION
  spec.authors     = ["Michael Bianco"]
  spec.email       = ["info@cliffsidedev.com"]
  spec.description = "Easily modify CSVs from the command line using Open Refine"
  spec.summary     = "Easily modify CSVs from the command line using Open Refine"
  spec.homepage    = "http://github.com/iloveitaly/orefine"
  spec.license     = "MIT"

  # Packaged files are whatever git tracks; executables are the basenames
  # of everything under bin/.
  tracked_files      = `git ls-files`.split($/)
  spec.files         = tracked_files
  spec.executables   = tracked_files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = tracked_files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies
  spec.add_dependency 'google-refine', '~> 0.1.1'
  spec.add_dependency 'slop', '~> 3.4.6'

  # Development dependencies
  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
end
@@ -0,0 +1,36 @@
1
+ # ORefine - CLI for Working With Open Refine
2
+ ORefine makes working with CSVs a bit less painful by automating some common operations. Contributions are welcome; this is a rough cut with the minimal features I needed to get a job done.
3
+
4
+ You'll need [OpenRefine](https://github.com/OpenRefine/OpenRefine) installed & running.
5
+
6
+ ## Examples
7
+ ```
8
+ # output a list of stripped emails from a CSV
9
+ orefine a_list_of_emails_and_other_columns.csv --output-columns=email_stripped --stdout
10
+
11
+ # list of emails common to both csvs
12
+ orefine full_list.csv other_set_to_intersect_with.csv --common --output-columns=email --stdout
13
+
14
+ # merge data from another list & tag
15
+ orefine import_list.csv external_list_with_zip_and_state_data.csv --merge=zip,State --add-static-column=source,"LIST-TAG-DATA" --open
16
+
17
+ # tag a list
18
+ orefine import_list.csv --add-static-column=source,AUG-2013-POSTCARD-IMPORT --output-columns=email_stripped,source --stdout > ~/Desktop/list_import.csv
19
+ ```
20
+
21
+ ## Development Resources
22
+ * [OpenRefine API](https://github.com/OpenRefine/OpenRefine/blob/a7273625d7c33af70b6d16db5782c802186b3b99/main/webapp/modules/core/MOD-INF/controller.js)
23
+ * [GREL `cross` function documentation](https://github.com/OpenRefine/OpenRefine/wiki/GREL-Other-Functions)
24
+ * [Google Refine Gem](https://github.com/iloveitaly/refine-ruby)
25
+ * [Tutorial on merging datasets with a common column](http://blog.ouseful.info/2011/05/06/merging-datesets-with-common-columns-in-google-refine/)
26
+
27
+ ## Contributing
28
+
29
+ 1. Fork it
30
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
31
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
32
+ 4. Push to the branch (`git push origin my-new-feature`)
33
+ 5. Create new Pull Request
34
+
35
+ ## Authors
36
+ * Mike Bianco, @iloveitaly
metadata ADDED
@@ -0,0 +1,120 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: orefine
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Michael Bianco
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-10-08 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: google-refine
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 0.1.1
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 0.1.1
30
+ - !ruby/object:Gem::Dependency
31
+ name: slop
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: 3.4.6
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 3.4.6
46
+ - !ruby/object:Gem::Dependency
47
+ name: bundler
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: '1.3'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '1.3'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rake
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: Easily modify CSVs from the command line using Open Refine
79
+ email:
80
+ - info@cliffsidedev.com
81
+ executables:
82
+ - orefine
83
+ extensions: []
84
+ extra_rdoc_files: []
85
+ files:
86
+ - .gitignore
87
+ - Gemfile
88
+ - LICENSE.txt
89
+ - Rakefile
90
+ - bin/orefine
91
+ - lib/orefine.rb
92
+ - lib/orefine/version.rb
93
+ - orefine.gemspec
94
+ - readme.md
95
+ homepage: http://github.com/iloveitaly/orefine
96
+ licenses:
97
+ - MIT
98
+ post_install_message:
99
+ rdoc_options: []
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ none: false
104
+ requirements:
105
+ - - ! '>='
106
+ - !ruby/object:Gem::Version
107
+ version: '0'
108
+ required_rubygems_version: !ruby/object:Gem::Requirement
109
+ none: false
110
+ requirements:
111
+ - - ! '>='
112
+ - !ruby/object:Gem::Version
113
+ version: '0'
114
+ requirements: []
115
+ rubyforge_project:
116
+ rubygems_version: 1.8.23
117
+ signing_key:
118
+ specification_version: 3
119
+ summary: Easily modify CSVs from the command line using Open Refine
120
+ test_files: []