refine-ruby 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1308b88676521f405e1a5c11e3d3817ee42749de
4
+ data.tar.gz: f5c27243aaafaebb80ce2806dcea697425f57b6e
5
+ SHA512:
6
+ metadata.gz: 02faba1c28168da4e55a065bc8e2aa2ba33c60dec3c077e76597e4a826e06fb216a74864fefc82ed8b43120bb5d0af8ab953a3400e29b0e460843324de7d7984
7
+ data.tar.gz: 2b8b9a0ac382c4372701303d5504b2190d61cb134880a0cf989b914f5c134d3c3df4e958e4f3ab3f11c196001e8d9d8f10846cc70f479315d6c423d8afe9e89d
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ .DS_Store
2
+ Gemfile.lock
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
+ source 'https://rubygems.org'
2
+
3
+ source ENV['PRIVATE_GEM_REPO'] if ENV['PRIVATE_GEM_REPO']
4
+ # Specify your gem's dependencies in the gemspec
5
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2018 David Huynh, Max Ogden & Distill Analytics Inc.
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.textile ADDED
@@ -0,0 +1,84 @@
1
+ h1. refine-ruby
+
+ refine-ruby is a Ruby gem client library for "OpenRefine":http://openrefine.org. It was written for Google Refine 2.x, but should work with OpenRefine as well.
2
+
3
+ If you want to port this to another language, check out the "Refine API":https://github.com/maxogden/refine-python/wiki/Refine-API documentation.
4
+
5
+ NOTE: The Refine client/server protocol is an internal API which is subject to change, so use at your own risk (although it has stayed relatively stable for the last few versions).
6
+
7
+ h2. Install
8
+
9
+ @gem install refine-ruby@
10
+
11
+ h2. Example
12
+
13
+ Given that you have the following raw data:
14
+
15
+ <pre>
16
+ <code>
17
+ Date
18
+ 7 December 2001
19
+ July 1 2002
20
+ 10/20/10
21
+ </code>
22
+ </pre>
23
+
24
+ Google Refine lets you clean up the data and export your operation history as a JSON instruction set. Here is an example that extracts the year from the above dates:
25
+
26
+ <pre>
27
+ <code>
28
+ [
29
+ {
30
+ "op": "core/text-transform",
31
+ "description": "Text transform on cells in column Date using expression grel:value.toDate()",
32
+ "engineConfig": {
33
+ "facets": [],
34
+ "mode": "row-based"
35
+ },
36
+ "columnName": "Date",
37
+ "expression": "grel:value.toDate()",
38
+ "onError": "set-to-blank",
39
+ "repeat": false,
40
+ "repeatCount": 10
41
+ },
42
+ {
43
+ "op": "core/text-transform",
44
+ "description": "Text transform on cells in column Date using expression grel:value.datePart(\"year\")",
45
+ "engineConfig": {
46
+ "facets": [],
47
+ "mode": "row-based"
48
+ },
49
+ "columnName": "Date",
50
+ "expression": "grel:value.datePart(\"year\")",
51
+ "onError": "set-to-blank",
52
+ "repeat": false,
53
+ "repeatCount": 10
54
+ }
55
+ ]
56
+ </code>
57
+ </pre>
58
+
59
+ You can use this gem to apply the operation set to the raw data from ruby. You will need to have Google Refine running on your local computer, or specify an external address (see source):
60
+
61
+ <pre>
62
+ <code>
63
+ prj = Refine.new('project_name' => 'date cleanup', 'file_name' => 'dates.txt')
64
+ prj.apply_operations('operations.json')
65
+ puts prj.export_rows('format' => 'csv')
66
+ prj.delete_project
67
+ </code>
68
+ </pre>
69
+
70
+ Which outputs:
71
+
72
+ <pre>
73
+ <code>
74
+ Date
75
+ 2001
76
+ 2002
77
+ 2010
78
+ </code>
79
+ </pre>
80
+
81
+ h2. Copyright
82
+
83
+ Copyright (c) 2011 David Huynh and Max Ogden. See LICENSE for details.
84
+
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+
4
# Registers the `test` task, which runs every file matching test/test_*.rb.
Rake::TestTask.new { |test_task| test_task.pattern = "test/test_*.rb" }

# Running `rake` with no arguments runs the test task.
task default: :test
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
data/bin/console ADDED
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "refine"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ require "pry"
11
+ Pry.start
data/lib/refine.rb ADDED
@@ -0,0 +1,231 @@
1
+ require 'httpclient'
2
+ require 'cgi'
3
+ require 'json'
4
+ require "addressable/uri"
5
+ require 'pry'
6
+
7
+ class Refine
8
+ attr_reader :project_name
9
+ attr_reader :project_id
10
+
11
+ def self.get_all_project_metadata(server="http://127.0.0.1:3333")
12
+ uri = "#{server}/command/core/get-all-project-metadata"
13
+ response = HTTPClient.new().get(uri)
14
+ JSON.parse(response.body)
15
+ end
16
+
17
+ def initialize(opts = {})
18
+ @server = opts["server"] || "http://127.0.0.1:3333"
19
+
20
+ @throws_exceptions = opts["throws_exceptions"] === false ? false : true
21
+
22
+ if opts["file_name"] && !opts["file_name"].empty? && opts["project_name"] && !opts["project_name"].empty?
23
+ project_name = CGI.escape(opts["project_name"])
24
+ @project_id = create_project(project_name, opts["file_name"])
25
+ @project_name = project_name if @project_id
26
+ else
27
+ @project_id = opts["project_id"]
28
+
29
+ metadata = self.get_project_metadata
30
+ @project_name = CGI.escape(metadata["name"])
31
+ end
32
+ end
33
+
34
+ def create_project(project_name, file_name)
35
+ uri = @server + "/command/core/create-project-from-upload"
36
+ project_id = false
37
+ File.open(file_name) do |file|
38
+ body = {
39
+ 'project-file' => file,
40
+ 'project-name' => project_name
41
+ }
42
+ response = client.post(uri, body)
43
+ url = response.header['Location']
44
+ unless url == []
45
+ project_id = CGI.parse(url[0].split('?')[1])['project'][0]
46
+ end
47
+ end
48
+ raise "Error creating project: #{response}" unless project_id
49
+ project_id
50
+ end
51
+
52
+ def apply_operations(file_name_or_string)
53
+ if File.exist?(file_name_or_string)
54
+ operations = File.read(file_name_or_string)
55
+ else
56
+ operations = file_name_or_string
57
+ end
58
+
59
+ call('apply-operations', 'operations' => operations)
60
+ end
61
+
62
+ def export_rows(opts={})
63
+ format = opts["format"] || 'tsv'
64
+ uri = @server + "/command/core/export-rows/#{@project_name}.#{format}"
65
+
66
+ body = {
67
+ 'engine' => {
68
+ "facets" => opts["facets"] || [],
69
+ "mode" => "row-based"
70
+ }.to_json,
71
+ 'options' => opts["options"] || '',
72
+ 'project' => @project_id,
73
+ 'format' => format
74
+ }
75
+
76
+ @response = client.post(uri, body)
77
+ @response.content
78
+ end
79
+
80
+ def delete_project
81
+ uri = @server + "/command/core/delete-project"
82
+ body = {
83
+ 'project' => @project_id
84
+ }
85
+ @response = client.post(uri, body)
86
+ JSON.parse(@response.content)['code'] rescue false
87
+ end
88
+
89
+ # this pattern is pulled from mailchimp/mailchimp-gem
90
+
91
+ def call(method, params = {})
92
+ uri = "#{@server}/command/core/#{method}"
93
+ params = { "project" => @project_id }.merge(params)
94
+
95
+ response = if method.start_with?('get-')
96
+ client.get(uri, params)
97
+ else
98
+ client.post(uri, params)
99
+ end
100
+
101
+ begin
102
+ response = JSON.parse(response.body)
103
+ rescue
104
+ response = JSON.parse('[' + response.body + ']').first
105
+ end
106
+
107
+ if @throws_exceptions && response.is_a?(Hash) && response["code"] && response["code"] == "error"
108
+ raise "API Error: #{response}"
109
+ end
110
+
111
+ response
112
+ end
113
+
114
+ def link_to_facets(*column_names)
115
+ uri = Addressable::URI.parse("#{@server}/project")
116
+
117
+ facet = facet_parameters(*column_names)
118
+
119
+ json_facet=JSON::dump(facets: facet).gsub(' ', "\t")
120
+
121
+ uri.query = Addressable::URI::form_encode({project: @project_id, ui: json_facet})
122
+
123
+ uri.to_s.gsub("%09", "%20")
124
+
125
+ end
126
+
127
+ def compute_facet(*column_names)
128
+
129
+ formatted = column_names.map do |column|
130
+ expression, sort_by, invert = facet_opts(column.values.first)
131
+ {
132
+ "columnName" => column.keys.first,
133
+ "expression" => expression,
134
+ "name" => column.keys.first,
135
+ "invert" => invert,
136
+ "sort" => sort_by,
137
+ "selection" => []
138
+ }
139
+ end
140
+
141
+ json_facet = JSON::dump(facets: [formatted.first])
142
+
143
+ openrefine_response = compute_facets("engine" => json_facet)
144
+
145
+ facet_response = openrefine_response.fetch("facets").first
146
+
147
+
148
+
149
+ if facet_response.key?("choices")
150
+
151
+ choice_hash = facet_response.fetch("choices").map do |h|
152
+ Hash[%w(value label count selection).zip([h["v"]["v"], h["v"]["l"], h["c"], h["s"]])]
153
+ end
154
+
155
+ response = choice_hash.inject({}) do |hash, choice|
156
+ hash.merge(choice["value"] => choice["count"])
157
+ end
158
+
159
+ else
160
+ response = "Error: " + facet_response.fetch("error")
161
+ end
162
+
163
+ end
164
+
165
+ def facet_parameters(*column_names)
166
+ column_names.map do |column|
167
+ case column when String then
168
+ {
169
+ "c" => {
170
+ "columnName" => column,
171
+ "expression"=>"value",
172
+ "name"=> column,
173
+ "invert"=> false
174
+ },
175
+ "o" => {
176
+ "sort" => "name"
177
+ }
178
+ }
179
+ when Hash
180
+ expression, sort_by, invert = facet_opts(column.values.first)
181
+
182
+ {
183
+ "c" => {
184
+ "columnName" => column.keys.first,
185
+ "expression"=> expression,
186
+ "name"=> column.keys.first,
187
+ "invert" => invert
188
+ },
189
+ "o" => {
190
+ "sort" => sort_by
191
+ }
192
+ }
193
+ end
194
+ end
195
+ end
196
+
197
+
198
+
199
+ def method_missing(method, *args)
200
+ # translate: get_column_info --> get-column-info
201
+ call(method.to_s.gsub('_', '-'), *args)
202
+ end
203
+
204
+ protected
205
+ def facet_opts(opts_array)
206
+ if opts_array.is_a? String
207
+ expression_present = opts_array.include? "value"
208
+ expression = expression_present ? opts_array : "value"
209
+ else
210
+ expression_present = opts_array[0].include? "value"
211
+ expression = expression_present ? opts_array[0] : "value"
212
+ end
213
+
214
+ sort_by = opts_array.include? "sort_count"
215
+ invert = opts_array.include? "invert"
216
+
217
+ sort_by = sort_by ? "count" : "name"
218
+ invert = invert ? true : false
219
+
220
+ return escape_backticks(expression), sort_by, invert
221
+ end
222
+
223
+ def escape_backticks(string)
224
+ string.gsub('//','////')
225
+ end
226
+
227
+ def client
228
+ @client ||= HTTPClient.new()
229
+ end
230
+
231
+ end
@@ -0,0 +1,3 @@
1
# Version of the refine-ruby gem.
#
# Declared with `class` because the library itself defines `class Refine`;
# declaring `module Refine` here makes Ruby raise a TypeError as soon as both
# files are loaded in the same process.
class Refine
  VERSION = "0.2.2"
end
@@ -0,0 +1,38 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'refine/version'
5
+
6
# Gem specification for refine-ruby: metadata, packaged files and dependencies.
Gem::Specification.new do |spec|
  spec.name        = %q{refine-ruby}
  spec.version     = Refine::VERSION
  spec.authors     = ["Caleb Buxton", "Allie Dukuze", "Max Ogden", "Michael Bianco"]
  spec.email       = %w{caleb+refine-ruby@distillanalytics.ca allie+refine-ruby@distillanalytics.ca}
  spec.summary     = %q{Client library for interacting with Google Refine instances}
  spec.description = %q{Client library for interacting with Google Refine instances. Easily work with CSVs from the command line}
  spec.homepage    = "http://github.com/maxogden/refine-ruby"

  spec.extra_rdoc_files = ["LICENSE", "README.textile"]
  spec.require_paths    = ["lib"]

  # List the git-tracked files once, then split them: anything under test/,
  # spec/ or features/ is a test file, everything else is packaged.
  tracked = `git ls-files -z`.split("\x0")
  test_files, packaged_files = tracked.partition { |path| path.match(%r{^(test|spec|features)/}) }
  spec.files      = packaged_files
  spec.test_files = test_files

  # Runtime dependencies
  spec.add_dependency "addressable", '~> 2.5'
  spec.add_dependency "httpclient", "~> 2.8.3"
  spec.add_dependency "json", "~> 2.1.0"

  # Development dependencies
  spec.add_development_dependency "bundler", "~> 1.10"
  spec.add_development_dependency "minitest", "~> 5.11"
  spec.add_development_dependency "pry", "~> 0.11.3"
  spec.add_development_dependency "rake", "~> 10.0"
end
data/test/dates.csv ADDED
@@ -0,0 +1,4 @@
1
+ Date
2
+ 7 December 2001
3
+ July 1 2002
4
+ 10/20/10
data/test/dates.txt ADDED
@@ -0,0 +1,4 @@
1
+ Date
2
+ 7 December 2001
3
+ July 1 2002
4
+ 10/20/10
@@ -0,0 +1,109 @@
1
+ ### NOTE: The internal client-server protocol used by OpenRefine is not yet maintained as a stable external API, subject to change. ###
2
+ ### Therefore, please report any changes you notice to kittelmann@sub.uni-goettingen.de ###
3
+ ### Some examples require cURL http://curl.haxx.se ###
4
+ ### It is assumed that examples are run from the 'test' directory. Otherwise paths need to be adjusted.
5
+ load '../lib/refine.rb'
6
+
7
+ ##########################
8
+ ### create initial project
9
+ ##########################
10
+ prj = Refine.new({ 'project_name' => 'date_cleanup', 'file_name' => 'dates.csv' })
11
+
12
+
13
+ ##########################
14
+ ### create another project
15
+ ##########################
16
+ prj.create_project( 'date_cleanup', 'dates.txt' ) # return value = project id, example: 1484090391100
17
+
18
+
19
+ ################
20
+ ### do something
21
+ ################
22
+ prj.apply_operations( 'operations.json' ) # return value = status code, example: {'code'=>'ok'}
23
+
24
+
25
+ ######################
26
+ ### extract operations
27
+ ######################
28
+ prj.get_operations # return value = operations as Hash
29
+
30
+ ######################################
31
+ ### save extracted operations to file:
32
+ ######################################
33
+ extracted_operations = prj.get_operations
34
+ File.open('../test/extracted_operations.json', 'w') do |f|
35
+ f.write extracted_operations
36
+ end
37
+
38
+
39
+ ###############
40
+ ### export data
41
+ ###############
42
+ prj.export_rows # return value = exported data as tsv
43
+ prj.export_rows( {'format'=>'tsv'} ) # return value = exported data as tsv
44
+ prj.export_rows( {'format'=>'csv'} ) # return value = exported data as csv
45
+
46
+ ### export data in custom table format
47
+ prj.export_rows( { 'options'=>{'separator'=>';','lineSeparator'=>'\n'} } ) # return value = exported data as *sv with semicolon for separator
48
+
49
+ ### additional options available:
50
+ prj.export_rows( { 'options'=>{'separator'=>';','lineSeparator'=>'\n', 'outputColumnHeaders'=>true, 'outputBlankRows'=>true, 'columns'=>[{'name'=>'Date1'}] } } )
51
+ prj.export_rows( { 'options'=>{'separator'=>';','lineSeparator'=>'\n', 'outputColumnHeaders'=>false, 'outputBlankRows'=>false } } )
52
+
53
+ ### save extracted data to file:
54
+ exported_data = prj.export_rows( {'format'=>'csv'} )
55
+ File.open('../test/exported_data.csv', 'w') do |f| # works
56
+ f.write exported_data
57
+ end
58
+
59
+
60
+ ##################################
61
+ ### export data using own template
62
+ ##################################
63
+
64
+ ### construct template as url-encoded string
65
+ prefix = '%7B%0D%0A++%22rows%22+%3A+%5B%0D%0A'
66
+ suffix = '%0D%0A++%5D%0D%0A%7D'
67
+ separator = '%2C%0D%0A'
68
+ row_template = '++++%7B%0D%0A++++++%22Column+1%22+%3A+%7B%7Bjsonize%28cells%5B%22Column+1%22%5D.value%29%7D%7D%0D%0A++++%7D'
69
+
70
+ ### call (using cURL http://curl.haxx.se)
71
+ data = "engine=%7B%22facets%22%3A%5B%5D%2C%22mode%22%3A%22row-based%22%7D&project=#{prj.project_id}&format=template&sorting=%7B%22criteria%22%3A%5B%5D%7D&prefix=#{prefix}&suffix=#{suffix}&separator=#{separator}&template=#{row_template}"
72
+ system "curl --data #{'"' + data + '"'} http://127.0.0.1:3333/command/core/export-rows/"
73
+
74
+ ### save extracted data to file:
75
+ system "curl --data #{'"' + data + '"'} http://127.0.0.1:3333/command/core/export-rows/ > exported_data.json"
76
+
77
+ ### let Ruby do the URL encoding of the template
78
+ prefix = CGI.escape('{
79
+ "rows" : [
80
+ ')
81
+ suffix = CGI.escape('
82
+ ]
83
+ }')
84
+ separator = CGI.escape(',
85
+ ')
86
+ row_template = CGI.escape(' {
87
+ "Column 1" : {{jsonize(cells["Column 1"].value)}}
88
+ }')
89
+ data = "engine=%7B%22facets%22%3A%5B%5D%2C%22mode%22%3A%22row-based%22%7D&project=#{prj.project_id}&format=template&sorting=%7B%22criteria%22%3A%5B%5D%7D&prefix=#{prefix}&suffix=#{suffix}&separator=#{separator}&template=#{row_template}"
90
+ system "curl --data #{'"' + data + '"'} http://127.0.0.1:3333/command/core/export-rows/"
91
+
92
+
93
+ #################
94
+ ### rename column
95
+ #################
96
+ prj.rename_column( { 'oldColumnName'=>'Date', 'newColumnName'=>'Date1' } ) # return value = status Hash, e.g. {"code"=>"ok", "historyEntry"=>{"id"=>1438598625335, "description"=>"Rename column Date to Date1", "time"=>"2015-08-03T12:29:53Z"}}
97
+
98
+
99
+ ############
100
+ ### metadata
101
+ ############
102
+ prj.get_project_metadata # return value = metadata as Hash
103
+ prj.get_all_project_metadata # return value = metadata for all projects as Hash
104
+
105
+
106
+ ##################
107
+ ### delete project
108
+ ##################
109
+ prj.delete_project # return value = status, e.g. ok
@@ -0,0 +1,28 @@
1
+ [
2
+ {
3
+ "op": "core/text-transform",
4
+ "description": "Text transform on cells in column Date using expression grel:value.toDate()",
5
+ "engineConfig": {
6
+ "facets": [],
7
+ "mode": "row-based"
8
+ },
9
+ "columnName": "Date",
10
+ "expression": "grel:value.toDate()",
11
+ "onError": "set-to-blank",
12
+ "repeat": false,
13
+ "repeatCount": 10
14
+ },
15
+ {
16
+ "op": "core/text-transform",
17
+ "description": "Text transform on cells in column Date using expression grel:value.datePart(\"year\")",
18
+ "engineConfig": {
19
+ "facets": [],
20
+ "mode": "row-based"
21
+ },
22
+ "columnName": "Date",
23
+ "expression": "grel:value.datePart(\"year\")",
24
+ "onError": "set-to-blank",
25
+ "repeat": false,
26
+ "repeatCount": 10
27
+ }
28
+ ]
data/test/test.rb ADDED
@@ -0,0 +1,6 @@
1
+ load '../lib/refine.rb'
2
+
3
+ prj = Refine.new({ "project_name" => 'date cleanup', "file_name" => 'dates.csv' })
4
+ prj.apply_operations('operations.json')
5
+ puts prj.export_rows('csv')
6
+ prj.delete_project
@@ -0,0 +1,232 @@
1
+ gem 'minitest'
2
+ require 'minitest/autorun'
3
+ require_relative '../lib/refine.rb'
4
+
5
+
6
+ describe Refine do
7
+
8
+ before do
9
+ @refine_project = Refine.new({ "project_name" => 'date_cleanup', "file_name" => './test/dates.txt' })
10
+ end
11
+
12
+ describe "error handling" do
13
+ it "throws an RuntimeError when @throws_exceptions == true (default)" do
14
+ faulty_operations = '[
15
+ "op": "core/text-transform",
16
+ "description": "Text transform on cells in column Date using expression grel:value.toDate()",
17
+ "engineConfig": {
18
+ "facets": [],
19
+ "mode": "row-based"
20
+ },
21
+ "onError": "set-to-blank",
22
+ "repeat": false,
23
+ "repeatCount": 10
24
+ }
25
+ ]'
26
+ proc {@refine_project.apply_operations(faulty_operations)}.must_raise RuntimeError
27
+ end
28
+
29
+ it "responds with error as a ruby hash when @throws_exceptions == false" do
30
+ @refine_project = Refine.new({ "project_name" => 'date_cleanup', "file_name" => './test/dates.txt', "throws_exceptions" => false })
31
+ faulty_operations = '[
32
+ "op": "core/text-transform",
33
+ "description": "Text transform on cells in column Date using expression grel:value.toDate()",
34
+ "engineConfig": {
35
+ "facets": [],
36
+ "mode": "row-based"
37
+ },
38
+ "onError": "set-to-blank",
39
+ "repeat": false,
40
+ "repeatCount": 10
41
+ }
42
+ ]'
43
+
44
+ _(@refine_project.apply_operations(faulty_operations)).must_equal (
45
+ {"stack"=>
46
+ "org.json.JSONException: Expected a ',' or ']' at 19 [character 17 line 2]\n\tat org.json.JSONTokener.syntaxError(JSONTokener.java:423)\n\tat org.json.JSONArray.<init>(JSONArray.java:143)\n\tat org.json.JSONTokener.nextValue(JSONTokener.java:356)\n\tat com.google.refine.util.ParsingUtilities.evaluateJsonStringToArray(ParsingUtilities.java:137)\n\tat com.google.refine.commands.history.ApplyOperationsCommand.doPost(ApplyOperationsCommand.java:63)\n\tat com.google.refine.RefineServlet.service(RefineServlet.java:177)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:820)\n\tat org.mortbay.jetty.servlet.ServletHolder.handle(ServletHolder.java:511)\n\tat org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1166)\n\tat org.mortbay.servlet.UserAgentFilter.doFilter(UserAgentFilter.java:81)\n\tat org.mortbay.servlet.GzipFilter.doFilter(GzipFilter.java:155)\n\tat org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1157)\n\tat org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:388)\n\tat org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:216)\n\tat org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)\n\tat org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:765)\n\tat org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:418)\n\tat org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)\n\tat org.mortbay.jetty.Server.handle(Server.java:326)\n\tat org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)\n\tat org.mortbay.jetty.HttpConnection$RequestHandler.content(HttpConnection.java:938)\n\tat org.mortbay.jetty.HttpParser.parseNext(HttpParser.java:755)\n\tat org.mortbay.jetty.HttpParser.parseAvailable(HttpParser.java:218)\n\tat org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:404)\n\tat org.mortbay.jetty.bio.SocketConnector$Connection.run(SocketConnector.java:228)\n\tat 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)\n\tat java.lang.Thread.run(Thread.java:745)\n",
47
+ "code"=>"error",
48
+ "message"=>"Expected a ',' or ']' at 19 [character 17 line 2]"}
49
+ )
50
+ end
51
+
52
+ end
53
+
54
+ it "finding project through project id" do
55
+ new_refine_project = Refine.new({"project_name" => 'date_cleanup', "file_name" => './test/dates.txt'})
56
+
57
+ finding_new_refine_project_using_id = Refine.new("project_id"=> new_refine_project.project_id)
58
+
59
+ assert_equal new_refine_project.project_name, finding_new_refine_project_using_id.project_name
60
+
61
+ new_refine_project.delete_project
62
+ end
63
+
64
+ it "refine_initializer_has_instance_variable_project_name" do
65
+ assert_equal 'date_cleanup', @refine_project.project_name
66
+ end
67
+
68
+ it "refine_initializer_has_instance_variable_project_id" do
69
+ assert @refine_project.project_id.match(/^[0-9]+$/)
70
+ end
71
+
72
+ it "get_all_project_metadata" do
73
+ assert Refine.get_all_project_metadata.instance_of? Hash
74
+ end
75
+
76
+ it "apply_operations" do
77
+ assert @refine_project.apply_operations( './test/operations.json' )
78
+ end
79
+
80
+ it "call" do
81
+ assert @refine_project.call( 'apply-operations', 'operations' => File.read( './test/operations.json' ) )
82
+ end
83
+
84
+ describe "deep linking into a facet state" do
85
+
86
+ it "creates working url for custom expressions which include spilting a string with spaces (i.e. value.split(/[ -\/]/) )" do
87
+ facet_url = @refine_project.link_to_facets({"company"=>'filter(forEach(forEach(value.split(/[ -\/]/),v,v.replace(/^[^\w\s]/,"") ),v2,v2.replace(/[^\w\s]$/,"").toLowercase()),i,isNonBlank(i))'})
88
+ assert_includes facet_url, "filter%28forEach%28forEach%28value.split%28%2F%5B%20-%5C%5C%2F%5D%2F%29%2Cv%2Cv.replace%28%2F%5E%5B%5E%5C%5Cw%5C%5Cs%5D%2F%2C%5C%22%5C%22%29%20%20%20%20%29%2Cv2%2Cv2.replace%28%2F%5B%5E%5C%5Cw%5C%5Cs%5D%24%2F%2C%5C%22%5C%22%29.toLowercase%28%29%29%2Ci%2CisNonBlank%28i%29%29%22%2C%22name%22%3A%22company%22%2C%22invert%22%3Afalse%7D%2C%22o%22%3A%7B%22sort%22%3A%22name%22%7D%7D%5D%7D"
89
+ end
90
+
91
+ it "creates working url for custom expressions which uses the equal operator '=='" do
92
+ facet_url = @refine_project.link_to_facets({"transcript_company"=>'filter(forEach(cells.company.value.split(/[\p{Punct}[^\s\w]]/),company_part,or(with(value.replace(/[\p{Punct}[^\s\w]]/,"").phonetic(),left,with(phonetic(company_part),right,or(left==right,or(left.contains(right),right.contains(left))))),with(value.replace(/[\p{Punct}[^\s\w]]/,"").fingerprint(),left,with(fingerprint(company_part.replace(/[\p{Punct}[^\s\w]]/,"")),right,or(left==right,or(left.contains(right),right.contains(left))))))),v,v).uniques().length()>=1'})
93
+ assert_includes facet_url, "filter%28forEach%28cells.company.value.split%28%2F%5B%5C%5Cp%7BPunct%7D%5B%5E%5C%5Cs%5C%5Cw%5D%5D%2F%29%2Ccompany_part%2Cor%28with%28value.replace%28%2F%5B%5C%5Cp%7BPunct%7D%5B%5E%5C%5Cs%5C%5Cw%5D%5D%2F%2C%5C%22%5C%22%29.phonetic%28%29%2Cleft%2Cwith%28phonetic%28company_part%29%2Cright%2Cor%28left%3D%3Dright%2Cor%28left.contains%28right%29%2Cright.contains%28left%29%29%29%29%29%2Cwith%28value.replace%28%2F%5B%5C%5Cp%7BPunct%7D%5B%5E%5C%5Cs%5C%5Cw%5D%5D%2F%2C%5C%22%5C%22%29.fingerprint%28%29%2Cleft%2Cwith%28fingerprint%28company_part.replace%28%2F%5B%5C%5Cp%7BPunct%7D%5B%5E%5C%5Cs%5C%5Cw%5D%5D%2F%2C%5C%22%5C%22%29%29%2Cright%2Cor%28left%3D%3Dright%2Cor%28left.contains%28right%29%2Cright.contains%28left%29%29%29%29%29%29%29%2Cv%2Cv%29.uniques%28%29.length%28%29%3E%3D1"
94
+ end
95
+
96
+ it "by generating urls based on a simple facet specification" do
97
+ # urls need the terms facet structure in json, encoded, as the UI parameter
98
+ raw_url = @refine_project.link_to_facets("Date")
99
+
100
+ url = URI::parse(raw_url)
101
+
102
+ assert !url.query.empty?
103
+
104
+ params = Hash[url.query.split("&").map{|i| i.split("=")}]
105
+
106
+ assert !params['ui'].nil?
107
+
108
+ facet_spec = JSON::parse(URI::decode(params['ui']))
109
+ assert_equal 1, facet_spec.fetch("facets").length
110
+
111
+ assert_equal "Date", facet_spec.fetch("facets").first.fetch("c").fetch("columnName")
112
+ assert_equal "Date", facet_spec.fetch("facets").first.fetch("c").fetch("name")
113
+ end
114
+
115
+ describe "depends on generating a ui query" do
116
+ describe "valid facet query" do
117
+ it "needs a control key ('c') for each facet" do
118
+ structure = @refine_project.facet_parameters("Date")
119
+ assert_includes structure.first.keys, "c"
120
+ end
121
+
122
+ it "each facet needs a columnName" do
123
+ facet = @refine_project.facet_parameters("Date").first.fetch('c')
124
+ assert_includes facet.keys, "columnName"
125
+ end
126
+ it "each facet needs an expression" do
127
+ facet = @refine_project.facet_parameters("Date").first.fetch('c')
128
+ assert_includes facet.keys, "expression"
129
+ end
130
+ it "each facet needs a name" do
131
+ facet = @refine_project.facet_parameters("Date").first.fetch('c')
132
+ assert_includes facet.keys, "name"
133
+ end
134
+ end
135
+
136
+ it "creates text facets for each column name provided" do
137
+ date_facet, company_facet = @refine_project.facet_parameters("Date","company")
138
+
139
+ assert_equal "Date", date_facet.fetch("c").fetch("columnName")
140
+ assert_equal "Date", date_facet.fetch("c").fetch("name")
141
+
142
+ assert_equal "company", company_facet.fetch("c").fetch("columnName")
143
+ assert_equal "company", company_facet.fetch("c").fetch("name")
144
+ end
145
+
146
+ describe "with custom expressions" do
147
+ it "has a `value` expression by default" do
148
+ date_facet, company_facet = @refine_project.facet_parameters("Date","company")
149
+
150
+ assert_equal "value", date_facet.fetch("c").fetch("expression")
151
+ assert_equal "value", company_facet.fetch("c").fetch("expression")
152
+ end
153
+
154
+ # our own expression
155
+ it "sanitizes custom expressions by escaping backticks" do
156
+ date_facet = @refine_project.link_to_facets("Date"=>'value. [\w\s] = ')
157
+ assert_includes date_facet, "value.%20%5B%5C%5Cw%5C%5Cs%5D%20%3D%20"
158
+ end
159
+
160
+ it "can pass custom expressions through a hash" do
161
+ date_facet, company_facet = @refine_project
162
+ .facet_parameters({"Date" => "value.utcTime()"},{"Company"=>"value.titleCase()"})
163
+
164
+ assert_equal "value.utcTime()", date_facet.fetch("c").fetch("expression")
165
+ assert_equal "Date", date_facet.fetch("c").fetch("columnName")
166
+
167
+ assert_equal "value.titleCase()", company_facet.fetch("c").fetch("expression")
168
+ assert_equal "Company", company_facet.fetch("c").fetch("columnName")
169
+ end
170
+ end
171
+
172
+ describe "with custom sorting" do
173
+ # this adds an 'o' data structure for order
174
+
175
+ it "sorts by `name` by default" do # ie existing test cases
176
+ date_facet = @refine_project.facet_parameters("Date")
177
+ assert_equal "name", date_facet.first.fetch("o").fetch("sort")
178
+ end
179
+
180
+ it "sorts by `count` when specified" do # need to choose a new signature / api
181
+ date_facet =@refine_project.facet_parameters({"Date"=>["value.utcTime()", "sort_count"]})
182
+ assert_equal "count", date_facet.first.fetch("o").fetch("sort")
183
+
184
+ end
185
+ end
186
+
187
+ describe "inversion" do
188
+ # find where to put these arguments in, probably 'c'
189
+ it "default's to not inverted" do
190
+ date_facet = @refine_project.facet_parameters("Date")
191
+ assert_equal false, date_facet.first.fetch("c").fetch("invert")
192
+ end
193
+ it "can specify to invert" do
194
+ date_facet = @refine_project.facet_parameters({"Date"=>["value.utcTime()", "invert"]})
195
+ assert_equal true, date_facet.first.fetch("c").fetch("invert")
196
+ end
197
+ end
198
+ end
199
+
200
+ describe "compute_facets" do
201
+ it "Request responds with error due to non existent column" do
202
+ response = @refine_project.compute_facet({"transcript_fiscal_year"=> ["isNonBlank(value)"]})
203
+ assert_equal("Error: No column named transcript_fiscal_year", response)
204
+ end
205
+
206
+ it "Request executes expression and sends response with the results" do
207
+ response = @refine_project.compute_facet({"Column 1"=> ["isNonBlank(value)"]})
208
+ assert_equal({true=>4}, response)
209
+ end
210
+
211
+ it "Request with faulty expression" do
212
+ response = @refine_project.compute_facet({"Column 1"=> ["iBlank(value)"]})
213
+ assert_equal("Error: Parsing error at offset 6: Unknown function or control named iBlank", response)
214
+ end
215
+ end
216
+
217
+ it "generates a link to the server" do
218
+ assert(URI::HTTP === URI::parse(@refine_project.link_to_facets("Date")))
219
+ end
220
+
221
+ # @refine_project.link_to_facets(...)
222
+ # => http://foo.bar/project/adfasdfsdf
223
+ # it "succeeds, with status 200 and a document" do
224
+ # assert(HTTPClient.get(@refine_project.link_to_facets("Date")).code == 200)
225
+ # end
226
+ end
227
+
228
+ after do
229
+ @refine_project.delete_project
230
+ end
231
+
232
+ end
metadata ADDED
@@ -0,0 +1,170 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: refine-ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.2
5
+ platform: ruby
6
+ authors:
7
+ - Caleb Buxton
8
+ - Allie Dukuze
9
+ - Max Ogden
10
+ - Michael Bianco
11
+ autorequire:
12
+ bindir: bin
13
+ cert_chain: []
14
+ date: 2018-05-15 00:00:00.000000000 Z
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: addressable
18
+ requirement: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - "~>"
21
+ - !ruby/object:Gem::Version
22
+ version: '2.5'
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '2.5'
30
+ - !ruby/object:Gem::Dependency
31
+ name: httpclient
32
+ requirement: !ruby/object:Gem::Requirement
33
+ requirements:
34
+ - - "~>"
35
+ - !ruby/object:Gem::Version
36
+ version: 2.8.3
37
+ type: :runtime
38
+ prerelease: false
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - "~>"
42
+ - !ruby/object:Gem::Version
43
+ version: 2.8.3
44
+ - !ruby/object:Gem::Dependency
45
+ name: json
46
+ requirement: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - "~>"
49
+ - !ruby/object:Gem::Version
50
+ version: 2.1.0
51
+ type: :runtime
52
+ prerelease: false
53
+ version_requirements: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - "~>"
56
+ - !ruby/object:Gem::Version
57
+ version: 2.1.0
58
+ - !ruby/object:Gem::Dependency
59
+ name: bundler
60
+ requirement: !ruby/object:Gem::Requirement
61
+ requirements:
62
+ - - "~>"
63
+ - !ruby/object:Gem::Version
64
+ version: '1.10'
65
+ type: :development
66
+ prerelease: false
67
+ version_requirements: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - "~>"
70
+ - !ruby/object:Gem::Version
71
+ version: '1.10'
72
+ - !ruby/object:Gem::Dependency
73
+ name: minitest
74
+ requirement: !ruby/object:Gem::Requirement
75
+ requirements:
76
+ - - "~>"
77
+ - !ruby/object:Gem::Version
78
+ version: '5.11'
79
+ type: :development
80
+ prerelease: false
81
+ version_requirements: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - "~>"
84
+ - !ruby/object:Gem::Version
85
+ version: '5.11'
86
+ - !ruby/object:Gem::Dependency
87
+ name: pry
88
+ requirement: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - "~>"
91
+ - !ruby/object:Gem::Version
92
+ version: 0.11.3
93
+ type: :development
94
+ prerelease: false
95
+ version_requirements: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - "~>"
98
+ - !ruby/object:Gem::Version
99
+ version: 0.11.3
100
+ - !ruby/object:Gem::Dependency
101
+ name: rake
102
+ requirement: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - "~>"
105
+ - !ruby/object:Gem::Version
106
+ version: '10.0'
107
+ type: :development
108
+ prerelease: false
109
+ version_requirements: !ruby/object:Gem::Requirement
110
+ requirements:
111
+ - - "~>"
112
+ - !ruby/object:Gem::Version
113
+ version: '10.0'
114
+ description: Client library for interacting with Google Refine instances. Easily work
115
+ with CSVs from the command line
116
+ email:
117
+ - caleb+refine-ruby@distillanalytics.ca
118
+ - allie+refine-ruby@distillanalytics.ca
119
+ executables: []
120
+ extensions: []
121
+ extra_rdoc_files:
122
+ - LICENSE
123
+ - README.textile
124
+ files:
125
+ - ".gitignore"
126
+ - Gemfile
127
+ - LICENSE
128
+ - README.textile
129
+ - Rakefile
130
+ - VERSION
131
+ - bin/console
132
+ - lib/refine.rb
133
+ - lib/refine/version.rb
134
+ - refine-ruby.gemspec
135
+ - test/dates.csv
136
+ - test/dates.txt
137
+ - test/examples_of_usage.rb
138
+ - test/operations.json
139
+ - test/test.rb
140
+ - test/test_refine.rb
141
+ homepage: http://github.com/maxogden/refine-ruby
142
+ licenses: []
143
+ metadata: {}
144
+ post_install_message:
145
+ rdoc_options: []
146
+ require_paths:
147
+ - lib
148
+ required_ruby_version: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ required_rubygems_version: !ruby/object:Gem::Requirement
154
+ requirements:
155
+ - - ">="
156
+ - !ruby/object:Gem::Version
157
+ version: '0'
158
+ requirements: []
159
+ rubyforge_project:
160
+ rubygems_version: 2.6.13
161
+ signing_key:
162
+ specification_version: 4
163
+ summary: Client library for interacting with Google Refine instances
164
+ test_files:
165
+ - test/dates.csv
166
+ - test/dates.txt
167
+ - test/examples_of_usage.rb
168
+ - test/operations.json
169
+ - test/test.rb
170
+ - test/test_refine.rb