google_refine 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. data/bin/upload-to-refine +43 -0
  2. data/lib/google_refine.rb +105 -0
  3. metadata +79 -0
@@ -0,0 +1,43 @@
1
#!/usr/bin/env ruby
#
# upload-to-refine: create a Refine project from a file, or from STDIN when
# no file argument is given.
#
# Usage:
#   upload-to-refine [--host HOST:PORT] [FILE]
#   some-command | upload-to-refine [--host HOST:PORT]
#
# On success the value returned by Refine#create_project is printed.

require File.expand_path('../../lib/google_refine', __FILE__)

require 'trollop'
require 'tempfile'

opts = Trollop::options do
  opt :host, "Address of Google Refine", :default => '0.0.0.0:3333'
end

# Installing a handler replaces Ruby's default Interrupt behaviour, so the
# handler has to end the process itself; `exit` still runs `ensure` blocks,
# which lets the temp file and the importing job be cleaned up.
Signal.trap("INT") do
  puts "Terminating: #{$$}"
  exit 1
end

host = opts[:host]

# ARGV.shift returns nil when no file argument was given.
file = ARGV.shift
# File.exist? rather than File.exists?: the latter was removed in Ruby 3.2.
abort "File \"#{file}\" does not exists." if file && !File.exist?(file)

tempfile = nil

begin
  if file.nil?
    abort "The STDIN should not be a TTY." if STDIN.tty?

    # Spool STDIN to disk because Refine#create_project takes a filename.
    # The data is written while the Tempfile is still open and the file is
    # closed afterwards so that everything is flushed before it is re-read.
    tempfile = Tempfile.new('upload-to-pipelines')
    tempfile.binmode
    IO.copy_stream(STDIN, tempfile)
    tempfile.close
    file = tempfile.path
  end

  refine = Refine.new(host)

  puts refine.create_project(file, :projectName => file)
ensure
  # Remove the spooled copy even when the upload failed or was interrupted.
  tempfile.unlink if tempfile
end
@@ -0,0 +1,105 @@
1
+ require 'rest_client'
2
+ require 'json'
3
+
4
# A project on a Refine server, identified by its id and the Refine
# connection it belongs to.
class Project

  # id     - the project identifier
  # refine - the owning connection; must respond to #url
  attr_accessor :id, :refine

  def initialize(refine, id)
    @refine = refine
    @id = id
  end

  # The project's URL on the server it belongs to.
  def to_s
    base = refine.url
    base.to_s + "/project?project=" + id.to_s
  end
end
18
+
19
# An importing job on a Refine server. Raw data is uploaded into the job
# and the job is then turned into a project.
class Job

  # Raised when the server reports that the importing job has failed.
  class ImportError < StandardError; end

  # Seconds to wait before each status request.
  POLL_INTERVAL = 2

  attr_accessor :id
  attr_accessor :refine

  # refine - the connection the job belongs to; must respond to #url
  # id     - the job identifier
  def initialize(refine, id)
    self.refine = refine
    self.id = id
  end

  # Uploads the file at +filename+ into this job and blocks until the job
  # status reports the state "ready".
  #
  # Raises ImportError if the job reports a failure. Returns nil.
  def load_raw_data(filename)
    # Block form so the file handle is closed once the upload has finished.
    File.open(filename, "rb") do |upload|
      RestClient.post(controller_url("load-raw-data"), :upload => upload)
    end

    wait_for_config { |config| config["state"] == "ready" }
    nil
  end

  # Asks the server to create a project from the uploaded data using
  # +options+ (serialised with #to_json) and blocks until the job status
  # carries a project id.
  #
  # Raises ImportError if the job reports a failure.
  # Returns a Project for the new project id.
  def create_project(options)
    RestClient.post(controller_url("create-project"),
                    :format  => "text/line-based/*sv",
                    :options => options.to_json)

    config = wait_for_config { |c| !c["projectID"].nil? }
    Project.new(self.refine, config["projectID"])
  end

  # Cancels this importing job on the server.
  def cancel
    RestClient.post("#{self.refine.url}/command/core/cancel-importing-job?jobID=#{self.id}", nil)
  end

  private

  # URL of the default importing controller for this job and +sub_command+.
  def controller_url(sub_command)
    "#{self.refine.url}/command/core/importing-controller" +
      "?controller=core%2Fdefault-importing-controller" +
      "&jobID=#{self.id}&subCommand=#{sub_command}"
  end

  # Polls the job status every POLL_INTERVAL seconds, yielding the job's
  # "config" hash, and returns that hash as soon as the block is truthy.
  def wait_for_config
    loop do
      sleep POLL_INTERVAL
      response = RestClient.post("#{self.refine.url}/command/core/get-importing-job-status?jobID=#{self.id}", nil)
      config = JSON[response]["job"]["config"]

      # NOTE(review): assumes the server flags a failed job with the state
      # "error" and describes it under "error"/"errors" -- confirm against
      # the Refine version in use. Without this check a failed job would be
      # polled forever.
      if config["state"] == "error"
        detail = config["error"] || config["errors"]
        raise ImportError, "Importing job #{self.id} failed#{detail ? ": #{detail}" : ''}"
      end

      return config if yield(config)
    end
  end

end
59
+
60
# Connection to a Refine server, addressed by its base URL.
class Refine

  # Defaults for the importer options sent when creating a project; a
  # default is used only when the caller leaves the option unset (nil).
  DEFAULT_OPTIONS = {
    :encoding               => "",
    :separator              => "\\t",
    :ignoreLines            => -1,
    :headerLines            => 0,
    :skipDataLines          => 0,
    :limit                  => 1_000_000,
    :storeBlankRows         => true,
    :guessCellValueTypes    => true,
    :processQuotes          => false,
    :storeBlankCellsAsNulls => true,
    :includeFileSources     => false
  }.freeze

  attr_reader :url

  # url - the server address, with or without an http(s):// scheme
  def initialize(url)
    self.url = url
  end

  # Stores the base URL, prefixing "http://" unless an http:// or https://
  # scheme is already present. The match is anchored to the start of the
  # string so a host that merely begins with "http" is still prefixed.
  def url=(url)
    @url = url =~ %r{\Ahttps?://}i ? url : "http://#{url}"
  end

  # Creates a new importing job on the server and returns it as a Job.
  def create_importing_job
    response = RestClient.post("#{self.url}/command/core/create-importing-job", nil)
    job_id = JSON[response]["jobID"]
    Job.new(self, job_id)
  end

  # Uploads +filename+ and creates a project from it.
  #
  # param may contain :format, :name (or :projectName) and any key of
  # DEFAULT_OPTIONS. Explicit false values are honoured.
  #
  # Returns whatever Job#create_project returns. The importing job is
  # cancelled afterwards in every case, success or failure.
  def create_project(filename, param = {})
    options = import_options(filename, param)

    job = create_importing_job
    job.load_raw_data(filename)
    job.create_project(options)
  ensure
    job.cancel if job
  end

  private

  # Builds the importer options hash from the caller's +param+ and the defaults.
  def import_options(filename, param)
    options = {}
    options[:format] = param[:format]
    # :projectName is accepted as an alias of :name because it is the key
    # the importer option itself uses.
    options[:projectName] = param[:name] || param[:projectName] ||
                            "File \"#{filename}\" uploaded on #{Time.now}"

    DEFAULT_OPTIONS.each do |key, default|
      value = param[key]
      # Not `value || default`: that would turn an explicit false into true.
      options[key] = value.nil? ? default : value
    end

    options
  end

end
metadata ADDED
@@ -0,0 +1,79 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: google_refine
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Cheng Guang-Nan
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-07-01 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rest-client
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: trollop
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description:
47
+ email: guangnan@chengguangnan.com
48
+ executables:
49
+ - upload-to-refine
50
+ extensions: []
51
+ extra_rdoc_files: []
52
+ files:
53
+ - lib/google_refine.rb
54
+ - bin/upload-to-refine
55
+ homepage: https://github.com/guangnan/google_refine
56
+ licenses: []
57
+ post_install_message:
58
+ rdoc_options: []
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ none: false
63
+ requirements:
64
+ - - ! '>='
65
+ - !ruby/object:Gem::Version
66
+ version: '0'
67
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
+ none: false
69
+ requirements:
70
+ - - ! '>='
71
+ - !ruby/object:Gem::Version
72
+ version: '0'
73
+ requirements: []
74
+ rubyforge_project:
75
+ rubygems_version: 1.8.23
76
+ signing_key:
77
+ specification_version: 3
78
+ summary: Upload files to refine using command line or programmatically.
79
+ test_files: []