google_refine 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/upload-to-refine +43 -0
- data/lib/google_refine.rb +105 -0
- metadata +79 -0
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.expand_path('../../lib/google_refine', __FILE__)
|
4
|
+
|
5
|
+
require 'trollop'
|
6
|
+
require 'tempfile'
|
7
|
+
|
8
|
+
# Command-line options: --host selects the Refine server to upload to.
opts = Trollop::options do
  opt :host, "Address of Google Refine", :default => '0.0.0.0:3333'
end

# Announce the interrupt and then really stop. Installing a handler replaces
# Ruby's default SIGINT behaviour, so without the explicit exit the process
# would keep polling the server after printing the message. `exit` raises
# SystemExit, which lets the `ensure` clauses (tempfile cleanup below, and any
# cleanup in the library) run.
Signal.trap("INT") do
  puts "Terminating: #{$$}"
  exit 130 # 128 + SIGINT, the conventional "killed by Ctrl-C" status
end

host = opts[:host]

# Optional first positional argument: path of the file to upload.
# ARGV.shift yields nil when no argument was given.
file = ARGV.shift
abort "File \"#{file}\" does not exists." if file && !File.exist?(file)

tempfile = nil

begin
  # No file argument: spool STDIN into a temporary file and upload that.
  if file.nil?
    abort "The STDIN should not be a TTY." if STDIN.tty?

    tempfile = Tempfile.new('upload-to-pipelines')
    # Only the path is needed; close our handle and let copy_stream open the
    # file by name instead of relying on an implicit to_path conversion.
    tempfile.close
    IO.copy_stream(STDIN, tempfile.path)
    file = tempfile.path
  end

  refine = Refine.new(host)

  # Refine#create_project reads the project name from the :name key.
  puts refine.create_project(file, :name => file)
ensure
  # Remove the spool file even when the upload fails or is interrupted.
  tempfile.unlink if tempfile
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
require 'rest_client'
|
2
|
+
require 'json'
|
3
|
+
|
4
|
+
# A project stored on a Refine server, identified by its id and the
# server object it belongs to.
class Project
  attr_accessor :id, :refine

  # refine - object that responds to #url (the owning server)
  # id     - identifier of the project on that server
  def initialize(refine, id)
    @refine = refine
    @id = id
  end

  # Returns the URL of the project page on the owning server.
  def to_s
    format("%s/project?project=%s", refine.url, id)
  end
end
|
18
|
+
|
19
|
+
# An importing job on a Refine server. A job receives the raw file
# (#load_raw_data) and is then turned into a project (#create_project);
# both steps poll the server's job status until they complete.
class Job

  # Raised when the server reports the job's state as "error".
  # NOTE(review): the "error" state and the "error" / "errorDetails" /
  # "errors" config keys follow the Refine importing protocol; confirm
  # against the server version in use.
  class ImportError < StandardError; end

  # Seconds to wait before each status poll.
  POLL_INTERVAL = 2

  attr_accessor :id
  attr_accessor :refine

  # refine - object that responds to #url (the owning server)
  # id     - job identifier handed out by the server
  def initialize(refine, id)
    self.refine = refine
    self.id = id
  end

  # Uploads +filename+ to the job and blocks until the job state is "ready".
  #
  # Raises ImportError if the server reports the "error" state; errors raised
  # by RestClient or File.open propagate unchanged.
  def load_raw_data(filename)
    # Block form guarantees the file handle is closed once the upload is done.
    File.open(filename, "rb") do |upload|
      RestClient.post(importing_controller_url("load-raw-data"), :upload => upload)
    end

    loop do
      sleep POLL_INTERVAL
      break if poll_config["state"] == "ready"
    end
  end

  # Asks the server to create a project from the uploaded data, using
  # +options+ (serialized with #to_json), and blocks until the job status
  # carries a projectID.
  #
  # Returns a Project for the new project id.
  # Raises ImportError if the server reports the "error" state.
  def create_project(options)
    RestClient.post(importing_controller_url("create-project"),
                    :format => "text/line-based/*sv",
                    :options => options.to_json)

    project_id = nil
    while project_id.nil?
      sleep POLL_INTERVAL
      project_id = poll_config["projectID"]
    end

    Project.new(self.refine, project_id)
  end

  # Cancels this importing job on the server.
  def cancel
    RestClient.post("#{self.refine.url}/command/core/cancel-importing-job?jobID=#{self.id}", nil)
  end

  private

  # URL of the default importing controller for this job and +sub_command+.
  def importing_controller_url(sub_command)
    "#{self.refine.url}/command/core/importing-controller?controller=core%2Fdefault-importing-controller&jobID=#{self.id}&subCommand=#{sub_command}"
  end

  # Fetches the job status and returns its "config" hash.
  # Raises ImportError instead of returning when the state is "error", so the
  # polling loops above cannot spin forever on a failed job.
  def poll_config
    response = RestClient.post("#{self.refine.url}/command/core/get-importing-job-status?jobID=#{self.id}", nil)
    config = JSON[response]["job"]["config"]

    if config["state"] == "error"
      details = config.values_at("error", "errorDetails", "errors").compact.map(&:to_s)
      message = "Importing job #{self.id} failed"
      message += ": #{details.join('; ')}" unless details.empty?
      raise ImportError, message
    end

    config
  end

end
|
59
|
+
|
60
|
+
# Client for a Refine server reachable at #url.
class Refine

  # Defaults for the import options sent to the server by #create_project.
  # A default is used only when the caller leaves the option out (or nil).
  DEFAULT_IMPORT_OPTIONS = {
    :encoding               => "",
    :separator              => "\\t",
    :ignoreLines            => -1,
    :headerLines            => 0,
    :skipDataLines          => 0,
    :limit                  => 1_000_000,
    :storeBlankRows         => true,
    :guessCellValueTypes    => true,
    :processQuotes          => false,
    :storeBlankCellsAsNulls => true,
    :includeFileSources     => false
  }.freeze

  # Base URL of the server; always carries an http:// or https:// scheme.
  attr_reader :url

  # url - server address, with or without a scheme (e.g. "0.0.0.0:3333")
  def initialize(url)
    self.url = url
  end

  # Stores the server address, prefixing "http://" when no http(s) scheme is
  # present. The pattern is anchored to the start of the string and requires
  # "://", so hosts that merely begin with "http" still get the prefix.
  def url=(url)
    url = "http://#{url}" unless url =~ %r{\Ahttps?://}i
    @url = url
  end

  # Creates a new importing job on the server and returns it as a Job.
  def create_importing_job
    response = RestClient.post("#{self.url}/command/core/create-importing-job", nil)
    job_id = JSON[response]["jobID"]
    Job.new(self, job_id)
  end

  # Uploads +filename+ and creates a project from it.
  #
  # param - optional Hash of import options. :name (or, equivalently,
  #         :projectName) sets the project name; :format is passed through;
  #         every key of DEFAULT_IMPORT_OPTIONS may be overridden, including
  #         with an explicit false.
  #
  # Returns whatever Job#create_project returns for the new project.
  # The importing job is cancelled afterwards whether or not the import
  # succeeded.
  def create_project(filename, param = {})
    options = {}
    options[:format] = param[:format]
    options[:projectName] = param[:name] || param[:projectName] ||
                            "File \"#{filename}\" uploaded on #{Time.now}"

    DEFAULT_IMPORT_OPTIONS.each do |key, default|
      value = param[key]
      # nil means "not given"; false is a legitimate value and must survive.
      options[key] = value.nil? ? default : value
    end

    job = create_importing_job
    job.load_raw_data(filename)
    job.create_project(options)
  ensure
    job.cancel if job
  end

end
|
metadata
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: google_refine
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Cheng Guang-Nan
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-07-01 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rest-client
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: trollop
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
description:
|
47
|
+
email: guangnan@chengguangnan.com
|
48
|
+
executables:
|
49
|
+
- upload-to-refine
|
50
|
+
extensions: []
|
51
|
+
extra_rdoc_files: []
|
52
|
+
files:
|
53
|
+
- lib/google_refine.rb
|
54
|
+
- bin/upload-to-refine
|
55
|
+
homepage: https://github.com/guangnan/google_refine
|
56
|
+
licenses: []
|
57
|
+
post_install_message:
|
58
|
+
rdoc_options: []
|
59
|
+
require_paths:
|
60
|
+
- lib
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
none: false
|
63
|
+
requirements:
|
64
|
+
- - ! '>='
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: '0'
|
67
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
|
+
none: false
|
69
|
+
requirements:
|
70
|
+
- - ! '>='
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: '0'
|
73
|
+
requirements: []
|
74
|
+
rubyforge_project:
|
75
|
+
rubygems_version: 1.8.23
|
76
|
+
signing_key:
|
77
|
+
specification_version: 3
|
78
|
+
summary: Upload files to refine using command line or programmatically.
|
79
|
+
test_files: []
|