google_refine 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/upload-to-refine +43 -0
- data/lib/google_refine.rb +105 -0
- metadata +79 -0
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.expand_path('../../lib/google_refine', __FILE__)
|
4
|
+
|
5
|
+
require 'trollop'
|
6
|
+
require 'tempfile'
|
7
|
+
|
8
|
+
# Command-line entry point: uploads a file (or the data piped in on STDIN) to a
# Refine server and prints the URL of the project that was created.
opts = Trollop::options do
  opt :host, "Address of Google Refine", :default => '0.0.0.0:3333'
end

# Announce the interruption and then actually stop. `exit` raises SystemExit,
# so the `ensure` clauses below (and in the library) still run their cleanup.
Signal.trap("INT") do
  puts "Terminating: #{$$}"
  exit 130 # 128 + SIGINT, the conventional status for "killed by Ctrl-C"
end

host = opts[:host]

# The first remaining argument, if any, is the file to upload.
# `ARGV.shift` returns nil when no argument is left.
file = ARGV.shift
abort "File \"#{file}\" does not exist." if file && !File.exist?(file)

tempfile = nil

begin
  if file.nil?
    # Without a file argument the data must be piped in on STDIN.
    abort "The STDIN should not be a TTY." if STDIN.tty?

    # Spool STDIN into a temporary file so it can be uploaded like a regular
    # file. The handle is closed first and the copy is done by path.
    tempfile = Tempfile.new('upload-to-pipelines')
    tempfile.close
    IO.copy_stream(STDIN, tempfile.path)
    file = tempfile.path
  end

  refine = Refine.new(host)

  puts refine.create_project(file, :projectName => file)
ensure
  # Remove the spooled copy even when the upload failed or was interrupted.
  tempfile.unlink if tempfile
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
require 'rest_client'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
# A project on a Refine server, identified by the server it lives on and its id.
class Project

  # Identifier of the project.
  attr_accessor :id
  # Object exposing the server's base address through `url`.
  attr_accessor :refine

  # refine - object responding to `url`.
  # id     - identifier of the project.
  def initialize(refine, id)
    self.refine = refine
    self.id = id
  end

  # Returns the address of the project page: "<refine.url>/project?project=<id>".
  def to_s
    "#{refine.url}/project?project=#{id}"
  end
end
|
18
|
+
|
19
|
+
# An importing job on a Refine server: raw data is uploaded into the job, then
# the job is turned into a project.
class Job

  # Seconds to wait between two polls of the job status.
  POLL_INTERVAL = 2

  # Identifier of the importing job.
  attr_accessor :id
  # Object exposing the server's base address through `url`.
  attr_accessor :refine

  # refine - object responding to `url`.
  # id     - identifier of the importing job.
  def initialize(refine, id)
    self.refine = refine
    self.id = id
  end

  # Uploads the file at `filename` into this job and blocks until the job
  # reports the "ready" state.
  #
  # Raises RuntimeError if the job reports the "error" state instead.
  def load_raw_data(filename)
    # Block form so the upload handle is closed even when the request fails.
    File.open(filename, "rb") do |upload|
      RestClient.post(controller_url("load-raw-data"), :upload => upload)
    end

    loop do
      sleep POLL_INTERVAL
      break if job_config["state"] == "ready"
    end
  end

  # Asks the server to create a project from the uploaded data and blocks until
  # the job status carries a project id.
  #
  # options - Hash of import options; sent to the server as JSON.
  #
  # Returns a Project for the new project id.
  # Raises RuntimeError if the job reports the "error" state.
  def create_project(options)
    RestClient.post(controller_url("create-project"),
                    :format => "text/line-based/*sv",
                    :options => options.to_json)

    project_id = nil
    while project_id.nil?
      sleep POLL_INTERVAL
      project_id = job_config["projectID"]
    end

    Project.new(refine, project_id)
  end

  # Asks the server to cancel this importing job.
  def cancel
    RestClient.post("#{refine.url}/command/core/cancel-importing-job?jobID=#{id}", nil)
  end

  private

  # Builds the importing-controller URL for the given sub command of this job.
  def controller_url(sub_command)
    "#{refine.url}/command/core/importing-controller?controller=core%2Fdefault-importing-controller&jobID=#{id}&subCommand=#{sub_command}"
  end

  # Fetches the current "config" section of this job's status.
  #
  # Raises RuntimeError when the state is "error"; without this check the
  # polling loops above would wait forever for a job that has already failed.
  # NOTE(review): the "error" state and the "error" key follow the Refine
  # importing-job status format - confirm against the server version in use.
  def job_config
    response = RestClient.post("#{refine.url}/command/core/get-importing-job-status?jobID=#{id}", nil)
    config = JSON.parse(response)["job"]["config"]

    if config["state"] == "error"
      detail = config["error"] || config.inspect
      raise "Importing job #{id} failed: #{detail}"
    end

    config
  end
end
|
59
|
+
|
60
|
+
# Client for a Refine server reachable at `url`.
class Refine

  # Import options applied when the caller does not supply a value.
  # The separator is a literal backslash followed by "t", exactly as sent
  # before; NOTE(review): presumably the server unescapes it to a tab - confirm.
  DEFAULT_OPTIONS = {
    :encoding => "",
    :separator => "\\t",
    :ignoreLines => -1,
    :headerLines => 0,
    :skipDataLines => 0,
    :limit => 1_000_000,
    :storeBlankRows => true,
    :guessCellValueTypes => true,
    :processQuotes => false,
    :storeBlankCellsAsNulls => true,
    :includeFileSources => false
  }.freeze

  # Base address of the server, always including a scheme.
  attr_accessor :url

  # url - address of the server, with or without a leading "http://".
  def initialize(url)
    self.url = url
  end

  # Stores the server address, prefixing "http://" when no http/https scheme is
  # present. The check is anchored to the start of the string and requires
  # "://", so a host such as "httpbin.org" is not mistaken for a full URL.
  def url=(url)
    url = "http://#{url}" unless url =~ %r{\Ahttps?://}i
    @url = url
  end

  # Creates a new importing job on the server.
  #
  # Returns a Job built from the "jobID" of the server's response.
  def create_importing_job
    response = RestClient.post("#{url}/command/core/create-importing-job", nil)
    Job.new(self, JSON[response]["jobID"])
  end

  # Uploads `filename` through a fresh importing job and creates a project.
  #
  # filename - path of the file to upload.
  # param    - Hash of import options. Keys match DEFAULT_OPTIONS, plus
  #            :format and the project name (:name, or :projectName as an
  #            alias). Missing or nil entries fall back to the defaults.
  #
  # Returns whatever Job#create_project returns.
  def create_project(filename, param = {})
    options = import_options(filename, param)

    job = create_importing_job
    job.load_raw_data(filename)
    job.create_project(options)
  ensure
    # The job is cancelled whether or not project creation succeeded.
    job.cancel if job
  end

  private

  # Merges the caller's parameters over the defaults, keeping the original
  # key order (:format, :projectName, then the DEFAULT_OPTIONS keys).
  def import_options(filename, param)
    options = {}
    options[:format] = param[:format]
    # :name keeps precedence for existing callers; :projectName is accepted
    # too because that is the key the option is sent under.
    options[:projectName] = param[:name] || param[:projectName] ||
                            "File \"#{filename}\" uploaded on #{Time.now}"

    DEFAULT_OPTIONS.each do |key, default|
      value = param[key]
      # Only nil falls back to the default, so an explicit false is honoured.
      options[key] = value.nil? ? default : value
    end

    options
  end
end
|
metadata
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: google_refine
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Cheng Guang-Nan
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-07-01 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rest-client
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: trollop
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
description:
|
47
|
+
email: guangnan@chengguangnan.com
|
48
|
+
executables:
|
49
|
+
- upload-to-refine
|
50
|
+
extensions: []
|
51
|
+
extra_rdoc_files: []
|
52
|
+
files:
|
53
|
+
- lib/google_refine.rb
|
54
|
+
- bin/upload-to-refine
|
55
|
+
homepage: https://github.com/guangnan/google_refine
|
56
|
+
licenses: []
|
57
|
+
post_install_message:
|
58
|
+
rdoc_options: []
|
59
|
+
require_paths:
|
60
|
+
- lib
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
none: false
|
63
|
+
requirements:
|
64
|
+
- - ! '>='
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0'
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
+
none: false
|
69
|
+
requirements:
|
70
|
+
- - ! '>='
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: '0'
|
73
|
+
requirements: []
|
74
|
+
rubyforge_project:
|
75
|
+
rubygems_version: 1.8.23
|
76
|
+
signing_key:
|
77
|
+
specification_version: 3
|
78
|
+
summary: Upload files to refine using command line or programmatically.
|
79
|
+
test_files: []
|