google_refine 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/bin/upload-to-refine +43 -0
  2. data/lib/google_refine.rb +105 -0
  3. metadata +79 -0
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path('../../lib/google_refine', __FILE__)
4
+
5
+ require 'trollop'
6
+ require 'tempfile'
7
+
8
# Command-line entry point: uploads a file (named on the command line or
# piped through STDIN) to a Google Refine server and prints the URL of the
# project that was created.
opts = Trollop::options do
  opt :host, "Address of Google Refine", :default => '0.0.0.0:3333'
end

# Announce the interrupt and then really stop. Exiting (instead of only
# printing) raises SystemExit, so the ensure block below still removes the
# temporary file. 130 is the conventional exit status for SIGINT.
Signal.trap("INT") do
  puts "Terminating: #{$$}"
  exit 130
end

host = opts[:host]

# Optional first positional argument: path of the file to upload.
file = nil
if ARGV.size > 0
  file = ARGV.shift
  abort "File \"#{file}\" does not exists." if not File.exist?(file)
end

tempfile = nil

begin
  # Without a file argument the data must be piped in; spool it to a
  # temporary file because the upload works from a path on disk.
  if file.nil?
    abort "The STDIN should not be a TTY." if STDIN.tty?

    tempfile = Tempfile.new('upload-to-pipelines')
    tempfile.binmode
    IO.copy_stream(STDIN, tempfile)
    # Close only after copying so the data is flushed to disk before the
    # file is re-opened by path for the upload.
    tempfile.close
    file = tempfile.path
  end

  refine = Refine.new(host)

  # Refine#create_project reads the project name from the :name key.
  puts refine.create_project(file, :name => file)
ensure
  # Remove the spooled copy even when the upload failed or was interrupted.
  tempfile.unlink if not tempfile.nil?
end
@@ -0,0 +1,105 @@
1
+ require 'rest_client'
2
+ require 'json'
3
+
4
# A project on a Refine server: pairs the owning Refine client with the
# project's identifier.
class Project
  # id     - identifier given at construction
  # refine - object whose +url+ is used as the prefix of the project link
  attr_accessor :id, :refine

  # Stores the client and the project id as given.
  def initialize(refine, id)
    @refine = refine
    @id = id
  end

  # Renders the project as a link built from the client's url and the id.
  def to_s
    format("%s/project?project=%s", refine.url, id)
  end
end
18
+
19
# An importing job on a Refine server. A job is fed raw data and then turned
# into a project; both steps are started with a POST and completed by polling
# the job status until the server reports the expected result.
class Job
  # Raised when the server reports that the importing job has failed.
  class ImportError < StandardError; end

  # Seconds to wait before each status poll.
  POLL_INTERVAL = 2

  attr_accessor :id
  attr_accessor :refine

  # refine - object exposing the server +url+
  # id     - job identifier as returned by the server
  def initialize(refine, id)
    self.refine = refine
    self.id = id
  end

  # Uploads +filename+ as the job's raw data and blocks until the job state
  # is "ready". The file handle is closed as soon as the upload request
  # returns. Raises ImportError if the job reports a failure. Returns nil.
  def load_raw_data(filename)
    File.open(filename, "rb") do |upload|
      RestClient.post(controller_url("load-raw-data"), :upload => upload)
    end

    wait_for_job { |config| config["state"] == "ready" }
    nil
  end

  # Asks the server to create a project from the loaded data using +options+
  # (serialised to JSON) and blocks until a project id shows up in the job
  # status. Raises ImportError if the job reports a failure.
  # Returns a Project bound to the same Refine client.
  def create_project(options)
    RestClient.post(controller_url("create-project"),
                    :format => "text/line-based/*sv",
                    :options => options.to_json)

    project_id = wait_for_job { |config| config["projectID"] }

    Project.new(self.refine, project_id)
  end

  # Cancels the importing job on the server.
  def cancel
    RestClient.post("#{self.refine.url}/command/core/cancel-importing-job?jobID=#{self.id}", nil)
  end

  private

  # URL of the default importing controller for this job and +sub_command+.
  def controller_url(sub_command)
    "#{self.refine.url}/command/core/importing-controller?controller=core%2Fdefault-importing-controller&jobID=#{self.id}&subCommand=#{sub_command}"
  end

  # Polls the job status every POLL_INTERVAL seconds, yielding the job's
  # config hash, until the block returns a truthy value; that value is
  # returned.
  # NOTE(review): a failed job is assumed to report state "error" with a
  # message under "error" -- confirm against the Refine version in use.
  # Without this check a failed import would be polled forever.
  def wait_for_job
    loop do
      sleep POLL_INTERVAL
      response = RestClient.post("#{self.refine.url}/command/core/get-importing-job-status?jobID=#{self.id}", nil)
      config = JSON.parse(response)["job"]["config"]

      if config["state"] == "error"
        reason = config["error"] || "unknown error"
        raise ImportError, "Importing job #{self.id} failed: #{reason}"
      end

      outcome = yield(config)
      return outcome if outcome
    end
  end
end
59
+
60
# Client for a Google Refine server: holds the server URL and drives the
# importing-job workflow that turns a local file into a project.
class Refine
  # Matches a URL that already starts with an explicit http:// or https://
  # scheme. Anchored with \A so only the very start of the string counts.
  SCHEME_PATTERN = %r{\Ahttps?://}i

  attr_reader :url

  # url - server address, with or without a scheme (see #url=).
  def initialize(url)
    self.url = url
  end

  # Stores the server address, prefixing "http://" when no http/https scheme
  # is present (so a bare "host:port" works, including hosts whose name
  # merely begins with "http").
  def url=(url)
    url = "http://#{url}" unless url =~ SCHEME_PATTERN
    @url = url
  end

  # Creates a new importing job on the server and returns it as a Job.
  def create_importing_job
    response = RestClient.post("#{self.url}/command/core/create-importing-job", nil)
    job_id = JSON[response]["jobID"]
    Job.new(self, job_id)
  end

  # Uploads +filename+ and creates a project from it.
  #
  # param - optional settings keyed by symbol. The project name is taken from
  #         :name (or :projectName); the remaining keys mirror the option
  #         names sent to the server and fall back to the defaults below.
  #         Boolean options honour an explicit +false+.
  #
  # Returns whatever Job#create_project returns. The importing job is
  # cancelled afterwards in every case, including on failure.
  def create_project(filename, param = {})

    options = {}
    options[:format] = param[:format]
    options[:projectName] = param[:name] || param[:projectName] || "File \"#{filename}\" uploaded on #{Time.now}"
    options[:encoding] = param[:encoding] || ""
    options[:separator] = param[:separator] || "\\t"
    options[:ignoreLines] = param[:ignoreLines] || -1
    options[:headerLines] = param[:headerLines] || 0
    options[:skipDataLines] = param[:skipDataLines] || 0
    options[:limit] = param[:limit] || 1_000_000
    options[:storeBlankRows] = flag(param, :storeBlankRows, true)
    options[:guessCellValueTypes] = flag(param, :guessCellValueTypes, true)
    options[:processQuotes] = flag(param, :processQuotes, false)
    options[:storeBlankCellsAsNulls] = flag(param, :storeBlankCellsAsNulls, true)
    options[:includeFileSources] = flag(param, :includeFileSources, false)

    job = create_importing_job
    job.load_raw_data(filename)
    project = job.create_project(options)
    project
  ensure
    job.cancel if job
  end

  private

  # Value of a boolean option: the caller's value when one was given (even
  # +false+), otherwise +default+. `param[key] || default` cannot be used
  # here because it would turn an explicit +false+ back into a true default.
  def flag(param, key, default)
    value = param[key]
    value.nil? ? default : value
  end
end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: google_refine
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Cheng Guang-Nan
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-01 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rest-client
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: trollop
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description:
47
+ email: guangnan@chengguangnan.com
48
+ executables:
49
+ - upload-to-refine
50
+ extensions: []
51
+ extra_rdoc_files: []
52
+ files:
53
+ - lib/google_refine.rb
54
+ - bin/upload-to-refine
55
+ homepage: https://github.com/guangnan/google_refine
56
+ licenses: []
57
+ post_install_message:
58
+ rdoc_options: []
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ! '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ none: false
69
+ requirements:
70
+ - - ! '>='
71
+ - !ruby/object:Gem::Version
72
+ version: '0'
73
+ requirements: []
74
+ rubyforge_project:
75
+ rubygems_version: 1.8.23
76
+ signing_key:
77
+ specification_version: 3
78
+ summary: Upload files to refine using command line or programmatically.
79
+ test_files: []