google-refine 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 David Huynh & Max Ogden
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.textile ADDED
@@ -0,0 +1,80 @@
1
+ <h1>google-refine</h1> is a Ruby Gem client library for "Google Refine":http://code.google.com/p/google-refine/
2
+
3
+ h2. Install
4
+
5
+ @gem install google-refine@
6
+
7
+ h2. Example
8
+
9
+ Given that you have the following raw data:
10
+
11
+ <pre>
12
+ <code>
13
+ Date
14
+ 7 December 2001
15
+ July 1 2002
16
+ 10/20/10
17
+ </code>
18
+ </pre>
19
+
20
+ Google Refine lets you clean up the data and export your operation history as a JSON instruction set. Here is an example that extracts the year from the above dates:
21
+
22
+ <pre>
23
+ <code>
24
+ [
25
+ {
26
+ "op": "core/text-transform",
27
+ "description": "Text transform on cells in column Date using expression grel:value.toDate()",
28
+ "engineConfig": {
29
+ "facets": [],
30
+ "mode": "row-based"
31
+ },
32
+ "columnName": "Date",
33
+ "expression": "grel:value.toDate()",
34
+ "onError": "set-to-blank",
35
+ "repeat": false,
36
+ "repeatCount": 10
37
+ },
38
+ {
39
+ "op": "core/text-transform",
40
+ "description": "Text transform on cells in column Date using expression grel:value.datePart(\"year\")",
41
+ "engineConfig": {
42
+ "facets": [],
43
+ "mode": "row-based"
44
+ },
45
+ "columnName": "Date",
46
+ "expression": "grel:value.datePart(\"year\")",
47
+ "onError": "set-to-blank",
48
+ "repeat": false,
49
+ "repeatCount": 10
50
+ }
51
+ ]
52
+ </code>
53
+ </pre>
54
+
55
+ You can use this gem to apply the operation set to the raw data from Ruby. Google Refine must be running on your local computer (the client defaults to @http://127.0.0.1:3333@); to use an instance at another address, pass its base URL as the third argument to @Refine.new@:
56
+
57
+ <pre>
58
+ <code>
59
+ prj = Refine.new('date cleanup', 'dates.txt')
60
+ prj.apply_operations('operations.json')
61
+ puts prj.export_rows('csv')
62
+ prj.delete_project
63
+ </code>
64
+ </pre>
65
+
66
+ Which outputs:
67
+
68
+ <pre>
69
+ <code>
70
+ Date
71
+ 2001
72
+ 2002
73
+ 2010
74
+ </code>
75
+ </pre>
76
+
77
+ h2. Copyright
78
+
79
+ Copyright (c) 2010 Max Ogden. See LICENSE for details.
80
+
data/Rakefile ADDED
@@ -0,0 +1,19 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
# Gem packaging and release tasks come from Jeweler. When Jeweler cannot be
# loaded, the Rakefile still loads and only prints an installation hint.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |spec|
    spec.name        = "google-refine"
    spec.summary     = "Client library for interacting with Google Refine instances"
    spec.description = "Client library for interacting with Google Refine instances"
    spec.email       = "max@maxogden.com"
    spec.homepage    = "http://github.com/maxogden/refine-ruby"
    spec.authors     = ["Max Ogden"]

    # Runtime dependencies of the client library.
    spec.add_dependency("json", ">= 1.4.6")
    spec.add_dependency("httpclient", ">= 2.1.6.1")
  end

  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,54 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for google-refine 0.0.1 (generated by Jeweler; regenerate
# with `rake gemspec` rather than editing by hand where possible).
Gem::Specification.new do |s|
  s.name = %q{google-refine}
  s.version = "0.0.1"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Max Ogden"]
  s.date = %q{2011-01-22}
  s.description = %q{Client library for interacting with Google Refine instances}
  s.email = %q{max@maxogden.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.textile"
  ]
  s.files = [
    "LICENSE",
    "README.textile",
    "Rakefile",
    "VERSION",
    "google-refine.gemspec",
    "lib/google-refine.rb",
    "test/dates.txt",
    "test/operations.json",
    "test/test.rb"
  ]
  s.homepage = %q{http://github.com/maxogden/refine-ruby}
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.7}
  s.summary = %q{Client library for interacting with Google Refine instances}
  s.test_files = [
    "test/test.rb"
  ]

  # Declare dependencies through whichever API the running RubyGems offers;
  # every branch registers the same two requirements.
  if s.respond_to? :specification_version then
    s.specification_version = 3

    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<json>, [">= 1.4.6"])
      s.add_runtime_dependency(%q<httpclient>, [">= 2.1.6.1"])
    else
      s.add_dependency(%q<json>, [">= 1.4.6"])
      s.add_dependency(%q<httpclient>, [">= 2.1.6.1"])
    end
  else
    s.add_dependency(%q<json>, [">= 1.4.6"])
    s.add_dependency(%q<httpclient>, [">= 2.1.6.1"])
  end
end
54
+
@@ -0,0 +1,66 @@
1
+ require 'httpclient'
2
+ require 'cgi'
3
+ require 'json'
4
+
5
+ class Refine
6
+ def initialize(project_name, file_name, server="http://127.0.0.1:3333")
7
+ project_name = CGI.escape(project_name)
8
+ @server = server
9
+ @project_id = create_project(project_name, file_name)
10
+ @project_name = project_name if @project_id
11
+ end
12
+
13
+ def create_project(project_name, file_name)
14
+ uri = @server + "/command/core/create-project-from-upload"
15
+ project_id = false
16
+ client = HTTPClient.new(@server)
17
+ File.open(file_name) do |file|
18
+ body = {
19
+ 'project-file' => file,
20
+ 'project-name' => "awesome"
21
+ }
22
+ response = client.post(uri, body)
23
+ url = response.header['Location']
24
+ unless url == []
25
+ project_id = CGI.parse(url[0].split('?')[1])['project'][0]
26
+ end
27
+ end
28
+ raise "Error creating project: #{response}" unless project_id
29
+ project_id
30
+ end
31
+
32
+ def apply_operations(file_name)
33
+ raise "You must create a project" unless @project_id
34
+ uri = @server + "/command/core/apply-operations?project=#{@project_id}"
35
+ client = HTTPClient.new(@server)
36
+ File.open(file_name) do |file|
37
+ body = {
38
+ 'operations' => file.read
39
+ }
40
+ @response = client.post(uri, body)
41
+ end
42
+ JSON.parse(@response.content)['code'] rescue false
43
+ end
44
+
45
+ def export_rows(format='tsv')
46
+ uri = @server + "/command/core/export-rows/#{@project_name}.#{format}"
47
+ client = HTTPClient.new(@server)
48
+ body = {
49
+ 'engine' => '{"facets":[],"mode":"row-based"}',
50
+ 'project' => @project_id,
51
+ 'format' => format
52
+ }
53
+ @response = client.post(uri, body)
54
+ @response.content
55
+ end
56
+
57
+ def delete_project
58
+ uri = @server + "/command/core/delete-project"
59
+ client = HTTPClient.new(@server)
60
+ body = {
61
+ 'project' => @project_id
62
+ }
63
+ @response = client.post(uri, body)
64
+ JSON.parse(@response.content)['code'] rescue false
65
+ end
66
+ end
data/test/dates.txt ADDED
@@ -0,0 +1,4 @@
1
+ Date
2
+ 7 December 2001
3
+ July 1 2002
4
+ 10/20/10
@@ -0,0 +1,28 @@
1
+ [
2
+ {
3
+ "op": "core/text-transform",
4
+ "description": "Text transform on cells in column Date using expression grel:value.toDate()",
5
+ "engineConfig": {
6
+ "facets": [],
7
+ "mode": "row-based"
8
+ },
9
+ "columnName": "Date",
10
+ "expression": "grel:value.toDate()",
11
+ "onError": "set-to-blank",
12
+ "repeat": false,
13
+ "repeatCount": 10
14
+ },
15
+ {
16
+ "op": "core/text-transform",
17
+ "description": "Text transform on cells in column Date using expression grel:value.datePart(\"year\")",
18
+ "engineConfig": {
19
+ "facets": [],
20
+ "mode": "row-based"
21
+ },
22
+ "columnName": "Date",
23
+ "expression": "grel:value.datePart(\"year\")",
24
+ "onError": "set-to-blank",
25
+ "repeat": false,
26
+ "repeatCount": 10
27
+ }
28
+ ]
data/test/test.rb ADDED
@@ -0,0 +1,6 @@
1
# Smoke test: uploads the sample dates, applies the sample operations and
# prints the exported rows. Needs a Google Refine instance listening on the
# client's default address.
#
# Paths are resolved relative to this file so the script can be started from
# any working directory; the library file is lib/google-refine.rb.
test_dir = File.dirname(__FILE__)
require File.expand_path('../lib/google-refine', test_dir)

prj = Refine.new('date cleanup', File.join(test_dir, 'dates.txt'))
begin
  prj.apply_operations(File.join(test_dir, 'operations.json'))
  puts prj.export_rows('csv')
ensure
  # Always remove the remote project, even when a step above raises.
  prj.delete_project
end
metadata ADDED
@@ -0,0 +1,108 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: google-refine
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Max Ogden
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-01-22 00:00:00 -08:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: json
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 11
30
+ segments:
31
+ - 1
32
+ - 4
33
+ - 6
34
+ version: 1.4.6
35
+ type: :runtime
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: httpclient
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ">="
44
+ - !ruby/object:Gem::Version
45
+ hash: 125
46
+ segments:
47
+ - 2
48
+ - 1
49
+ - 6
50
+ - 1
51
+ version: 2.1.6.1
52
+ type: :runtime
53
+ version_requirements: *id002
54
+ description: Client library for interacting with Google Refine instances
55
+ email: max@maxogden.com
56
+ executables: []
57
+
58
+ extensions: []
59
+
60
+ extra_rdoc_files:
61
+ - LICENSE
62
+ - README.textile
63
+ files:
64
+ - LICENSE
65
+ - README.textile
66
+ - Rakefile
67
+ - VERSION
68
+ - google-refine.gemspec
69
+ - lib/google-refine.rb
70
+ - test/dates.txt
71
+ - test/operations.json
72
+ - test/test.rb
73
+ has_rdoc: true
74
+ homepage: http://github.com/maxogden/refine-ruby
75
+ licenses: []
76
+
77
+ post_install_message:
78
+ rdoc_options: []
79
+
80
+ require_paths:
81
+ - lib
82
+ required_ruby_version: !ruby/object:Gem::Requirement
83
+ none: false
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ hash: 3
88
+ segments:
89
+ - 0
90
+ version: "0"
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ none: false
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ hash: 3
97
+ segments:
98
+ - 0
99
+ version: "0"
100
+ requirements: []
101
+
102
+ rubyforge_project:
103
+ rubygems_version: 1.3.7
104
+ signing_key:
105
+ specification_version: 3
106
+ summary: Client library for interacting with Google Refine instances
107
+ test_files:
108
+ - test/test.rb