orefine 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ Gemfile.lock
2
+ pkg/
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Michael Bianco
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'orefine'
4
+ require 'slop'
5
+
6
include Orefine

# Command-line entry point: loads one or two CSV files into Open Refine
# projects named csv_a / csv_b, applies the requested clean-up operations to
# csv_a and optionally prints the result and/or opens it in a browser.
#
# The parsed options are kept in the global $opts, as before.
$opts = Slop.parse do
  # The executable is installed as `orefine`, and csv_b is optional.
  banner 'Usage: orefine csv_a [csv_b] [options]'

  on 'output-columns=', 'List of columns you want in the resulting csv', as: Array
  on 'delete-columns=', 'What columns to delete from the output', as: Array
  on 'merge=', 'What column to merge in from csv_b', as: Array
  on 'add-static-column=', 'Add a column with a static value (input: key, value)', as: Array
  # Declared as a list like the other multi-value options; the value is
  # iterated with #each further down, which a plain String does not support.
  on 'merge-common=', 'Merge values from common columns with a comma', as: Array

  on 'diff', 'only output rows in csv_a whose email does not exist in csv_b'
  on 'common', 'only output rows common to both csvs'
  on 'split-full-name', 'split a full name field into first and last'

  on 'open', 'open the document in a web browser'
  on 'stdout', 'write the resulting csv to stdout'
end

if ARGV.empty?
  puts $opts.help
  exit
end

csv_a_path = ARGV[0]
csv_b_path = ARGV[1]

# TODO clear out all old CSV a & b or timestamp the new ones

CSVUtil.clear_all_csvs
csv_a = Refine.new("project_name" => 'csv_a', "file_name" => csv_a_path)

# ARGV[1] is only treated as csv_b when it names an existing file. With a
# single csv the second argument is typically an option string (see the README
# examples), so a non-existent path is skipped silently on purpose.
# File.exist? replaces File.exists?, which was removed in Ruby 3.2.
csv_b = nil
if csv_b_path && File.exist?(csv_b_path)
  csv_b = Refine.new("project_name" => 'csv_b', "file_name" => csv_b_path)
end

# Merging reads from csv_b; fail with a clear message instead of a
# NoMethodError on nil half-way through the run.
if csv_b.nil? && ($opts['merge'] || $opts['merge-common'])
  abort '--merge and --merge-common need a second csv file (csv_b) that exists'
end

all_csvs = [csv_a, csv_b].compact

CSVUtil.normalize_column_names(all_csvs)
CSVUtil.normalize_email_column_content(all_csvs)
CSVUtil.create_common_flag(csv_a, csv_b) if csv_b

CSVUtil.split_full_name(csv_a) if $opts['split-full-name']

# Array(nil) is [], so absent options simply skip their loop.
Array($opts['merge']).each do |merge_field|
  CSVUtil.merge_field(csv_a, csv_b, merge_field)
end

Array($opts['merge-common']).each do |common_merge|
  CSVUtil.merge_common_field(csv_a, csv_b, common_merge)
end

if $opts['add-static-column']
  # Expected input is "key,value": first element is the column name, last the value.
  CSVUtil.add_column(csv_a, $opts['add-static-column'].first, $opts['add-static-column'].last)
end

output_params = { "format" => "csv" }

if $opts['output-columns']
  output_params["options"] = {
    "columns" => $opts['output-columns'].map { |column_name| { "name" => column_name } }
  }
end

Array($opts['delete-columns']).each do |column_name|
  CSVUtil.delete_column(csv_a, column_name)
end

if csv_b && ($opts.diff? || $opts.common?)
  # The facet filters on the "exists" flag column: true keeps rows found in
  # csv_b (--common), false keeps rows missing from it (--diff). --diff wins
  # when both switches are given.
  output_params["facets"] = [CSVUtil.common_facet(!$opts.diff?)]
end

puts csv_a.export_rows(output_params) if $opts.stdout?

if $opts.open?
  # Pass the URL as a separate argument so no shell parses it.
  system('open', "http://127.0.0.1:3333/project?project=#{csv_a.project_id}")
end
@@ -0,0 +1,234 @@
1
+ require "orefine/version"
2
+ require 'google-refine'
3
+
4
require 'json'

module Orefine
  # Stateless helpers that send Open Refine operation lists to one or more
  # project objects. A "project" is any object responding to
  # #apply_operations (and, where noted, #project_name / #get_columns_info).
  #
  # Operation lists are built as Ruby structures and serialized with
  # JSON.generate, so the payload is always valid JSON and column names or
  # values containing quotes or backslashes cannot corrupt it.
  class CSVUtil
    # Header spellings that get renamed to the canonical column name.
    EMAIL_COLUMN_ALIASES = ['E-mail', 'Email Address', 'Email', '[email]', 'email_stripped'].freeze
    ZIP_COLUMN_ALIASES = ['Zip', '[zip]'].freeze
    FULL_NAME_COLUMN_ALIASES = ['Name', 'Full Name'].freeze

    class << self
      # Deletes every Refine project whose name starts with "csv_".
      #
      # @return [Array] the results of the individual delete_project calls
      def clear_all_csvs
        Refine.get_all_project_metadata["projects"]
          .select { |_project_id, metadata| metadata["name"].to_s.start_with?('csv_') }
          .keys
          .map { |project_id| Refine.new("project_id" => project_id).delete_project }
      end

      # Runs the email, zip and full-name header normalizations, in that order.
      #
      # @param projects [Object, Array] a project or a list of projects
      def normalize_column_names(projects)
        normalize_email_column_name(projects)
        normalize_zip_column_name(projects)
        normalize_full_name_column_name(projects)
      end

      # Renames the known email header variants to "email".
      #
      # @param projects [Object, Array] a project or a list of projects
      def normalize_email_column_name(projects)
        perform_operation(projects, rename_operations(EMAIL_COLUMN_ALIASES, 'email'))
      end

      # Renames the known zip header variants to "zip".
      #
      # @param projects [Object, Array] a project or a list of projects
      def normalize_zip_column_name(projects)
        perform_operation(projects, rename_operations(ZIP_COLUMN_ALIASES, 'zip'))
      end

      # Renames the known name header variants to "full_name".
      #
      # @param projects [Object, Array] a project or a list of projects
      def normalize_full_name_column_name(projects)
        perform_operation(projects, rename_operations(FULL_NAME_COLUMN_ALIASES, 'full_name'))
      end

      # Adds an "email_stripped" column holding the lower-cased, stripped
      # value of the "email" column.
      #
      # @param projects [Object, Array] a project or a list of projects
      def normalize_email_column_content(projects)
        perform_operation(projects, [
          column_addition('email_stripped', 'email', 'grel:strip(value.toLowercase())')
        ])
      end

      # Splits the "full_name" column into at most two columns at a space that
      # follows a lowercase letter; the original column is kept.
      #
      # @param projects [Object, Array] a project or a list of projects
      def split_full_name(projects)
        perform_operation(projects, [
          {
            "op" => "core/column-split",
            "description" => "Split column Name by separator",
            "engineConfig" => { "facets" => [], "mode" => "row-based" },
            "columnName" => "full_name",
            "guessCellType" => false,
            "removeOriginalColumn" => false,
            "mode" => "separator",
            "separator" => "(?<=[a-z]) ",
            "regex" => true,
            "maxColumns" => 2
          }
        ])
      end

      # Adds an "exists" column to project_a that is true when the row's
      # email_stripped value is also present in project_b. An already existing
      # "exists" column is removed first (with a notice on STDERR).
      #
      # @param project_a [Object] project receiving the flag column
      # @param project_b [Object] project looked up by its project_name
      def create_common_flag(project_a, project_b)
        if project_a.get_columns_info.any? { |column| column["name"] == 'exists' }
          STDERR.puts "'exists' column already exists in csv_a, deleting"
          delete_column(project_a, "exists")
        end

        lookup = cross_lookup(project_b)
        perform_operation(project_a, [
          column_addition('exists', 'email_stripped', "grel:#{lookup}.cells.length() > 0")
        ])
      end

      # Copies `field` from the matching row of project_b (matched on
      # email_stripped) into a new "<field>_merged" column of project_a.
      #
      # @param project_a [Object] project receiving the merged column
      # @param project_b [Object] project looked up by its project_name
      # @param field [String] column name in project_b
      def merge_field(project_a, project_b, field)
        lookup = cross_lookup(project_b)
        perform_operation(project_a, [
          column_addition("#{field}_merged", 'email_stripped',
                          "grel:#{lookup}.cells[#{grel_string(field)}].value[0]")
        ])
      end

      # Placeholder: merging the values of a column common to both projects is
      # not implemented. Warns instead of silently doing nothing.
      #
      # @return [nil]
      def merge_common_field(csv_a, csv_b, common_field)
        warn "orefine: merge_common_field is not implemented, ignoring '#{common_field}'"
      end

      # Removes the column `field` from the given project(s).
      #
      # @param csv [Object, Array] a project or a list of projects
      # @param field [String] column name to remove
      def delete_column(csv, field)
        perform_operation(csv, [
          { "op" => "core/column-removal", "columnName" => field.to_s }
        ])
      end

      # Adds a column `field` whose every cell holds the constant `value`.
      #
      # @param csv [Object, Array] a project or a list of projects
      # @param field [String] name of the new column
      # @param value [String] constant cell content
      def add_column(csv, field, value)
        perform_operation(csv, [
          column_addition(field.to_s, 'email_stripped', "grel:#{grel_string(value)}")
        ])
      end

      # Builds the list-facet definition selecting rows by the "exists" column.
      #
      # @param flag [Boolean] value of "exists" to select
      # @return [Hash] facet definition
      def common_facet(flag = true)
        {
          "invert" => false,
          "expression" => "value",
          "selectError" => false,
          "omitError" => false,
          "selectBlank" => false,
          "name" => "exists",
          "omitBlank" => false,
          "columnName" => "exists",
          "type" => "list",
          "selection" => [
            {
              "v" => {
                # string vs boolean matters here... be careful
                "v" => flag,
                "l" => flag,
              }
            }
          ]
        }
      end

      # Applies an operation list to one project or to each project in a list.
      #
      # @param projects [Object, Array] a project or a list of projects
      # @param operation [String, Array, Hash] JSON text, passed through
      #   untouched, or a Ruby structure that is serialized to JSON first
      # @return [Array] the projects the operation was applied to
      def perform_operation(projects, operation)
        payload = operation.is_a?(String) ? operation : JSON.generate(operation)
        targets = projects.is_a?(Array) ? projects : [projects]
        targets.each { |project| project.apply_operations(payload) }
      end

      private

      # One core/column-rename operation per alias, all targeting new_name.
      def rename_operations(old_names, new_name)
        old_names.map do |old_name|
          {
            "op" => "core/column-rename",
            "oldColumnName" => old_name,
            "newColumnName" => new_name
          }
        end
      end

      # A core/column-addition operation inserting new_column at index 0.
      def column_addition(new_column, base_column, expression)
        {
          "op" => "core/column-addition",
          "engineConfig" => { "facets" => [], "mode" => "record-based" },
          "newColumnName" => new_column,
          "columnInsertIndex" => 0,
          "baseColumnName" => base_column,
          "expression" => expression,
          "onError" => "set-to-blank"
        }
      end

      # GREL expression crossing the current cell with project's email_stripped column.
      def cross_lookup(project)
        "cell.cross(#{grel_string(project.project_name)}, \"email_stripped\")"
      end

      # Quotes text as a GREL string literal, backslash-escaping quotes and
      # backslashes. NOTE(review): assumes GREL honours backslash escapes
      # inside double-quoted strings - confirm against the GREL reference.
      def grel_string(text)
        escaped = text.to_s.gsub(/[\\"]/) { |char| "\\#{char}" }
        "\"#{escaped}\""
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
# Namespace of the orefine gem.
module Orefine
  # Release number of the gem; read by the gemspec.
  VERSION = '0.0.1'
end
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'orefine/version'
5
+
6
# Packaging manifest: identity, file list and dependencies of the orefine gem.
Gem::Specification.new do |spec|
  # Identity
  spec.name     = "orefine"
  spec.version  = Orefine::VERSION
  spec.license  = "MIT"
  spec.homepage = "http://github.com/iloveitaly/orefine"

  # Author contact
  spec.authors = ["Michael Bianco"]
  spec.email   = ["info@cliffsidedev.com"]

  # Short and long description share the same wording.
  spec.summary     = "Easily modify CSVs from the command line using Open Refine"
  spec.description = "Easily modify CSVs from the command line using Open Refine"

  # Everything tracked by git ships with the gem; executables and tests are
  # picked out of that list by path prefix.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies
  spec.add_dependency 'google-refine', '~> 0.1.1'
  spec.add_dependency 'slop', '~> 3.4.6'

  # Development-only dependencies
  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
end
@@ -0,0 +1,36 @@
1
+ # ORefine - CLI for Working With Open Refine
2
+ Makes working with CSVs a bit less painful by automating some common operations. Contributions are welcome: this is a rough cut with the minimal features I needed to get a job done.
3
+
4
+ You'll need [OpenRefine](https://github.com/OpenRefine/OpenRefine) installed & running.
5
+
6
+ ## Examples
7
+ ```
8
+ # output a list of stripped emails from a CSV
9
+ orefine a_list_of_emails_and_other_columns.csv --output-columns=email_stripped --stdout
10
+
11
+ # list of emails common to both csvs
12
+ orefine full_list.csv other_set_to_intersect_with.csv --common --output-columns=email --stdout
13
+
14
+ # merge data from another list & tag
15
+ orefine import_list.csv external_list_with_zip_and_state_data.csv --merge=zip,State --add-static-column=source,"LIST-TAG-DATA" --open
16
+
17
+ # tag a list
18
+ orefine import_list.csv --add-static-column=source,AUG-2013-POSTCARD-IMPORT --output-columns=email_stripped,source --stdout > ~/Desktop/list_import.csv
19
+ ```
20
+
21
+ ## Development Resources
22
+ * [OpenRefine API](https://github.com/OpenRefine/OpenRefine/blob/a7273625d7c33af70b6d16db5782c802186b3b99/main/webapp/modules/core/MOD-INF/controller.js)
23
+ * [GRel cross documentation](https://github.com/OpenRefine/OpenRefine/wiki/GREL-Other-Functions)
24
+ * [Google Refine Gem](https://github.com/iloveitaly/refine-ruby)
25
+ * [Tutorial on merging datasets with a common column](http://blog.ouseful.info/2011/05/06/merging-datesets-with-common-columns-in-google-refine/)
26
+
27
+ ## Contributing
28
+
29
+ 1. Fork it
30
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
31
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
32
+ 4. Push to the branch (`git push origin my-new-feature`)
33
+ 5. Create new Pull Request
34
+
35
+ ## Authors
36
+ * Mike Bianco, @iloveitaly
metadata ADDED
@@ -0,0 +1,120 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: orefine
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Michael Bianco
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-10-08 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: google-refine
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 0.1.1
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: 0.1.1
30
+ - !ruby/object:Gem::Dependency
31
+ name: slop
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: 3.4.6
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 3.4.6
46
+ - !ruby/object:Gem::Dependency
47
+ name: bundler
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: '1.3'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '1.3'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rake
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ description: Easily modify CSVs from the command line using Open Refine
79
+ email:
80
+ - info@cliffsidedev.com
81
+ executables:
82
+ - orefine
83
+ extensions: []
84
+ extra_rdoc_files: []
85
+ files:
86
+ - .gitignore
87
+ - Gemfile
88
+ - LICENSE.txt
89
+ - Rakefile
90
+ - bin/orefine
91
+ - lib/orefine.rb
92
+ - lib/orefine/version.rb
93
+ - orefine.gemspec
94
+ - readme.md
95
+ homepage: http://github.com/iloveitaly/orefine
96
+ licenses:
97
+ - MIT
98
+ post_install_message:
99
+ rdoc_options: []
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ none: false
104
+ requirements:
105
+ - - ! '>='
106
+ - !ruby/object:Gem::Version
107
+ version: '0'
108
+ required_rubygems_version: !ruby/object:Gem::Requirement
109
+ none: false
110
+ requirements:
111
+ - - ! '>='
112
+ - !ruby/object:Gem::Version
113
+ version: '0'
114
+ requirements: []
115
+ rubyforge_project:
116
+ rubygems_version: 1.8.23
117
+ signing_key:
118
+ specification_version: 3
119
+ summary: Easily modify CSVs from the command line using Open Refine
120
+ test_files: []