theman 0.0.5 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +1 -0
- data/README.rdoc +64 -44
- data/lib/theman/agency/columns.rb +95 -0
- data/lib/theman/agency/table.rb +24 -0
- data/lib/theman/agency.rb +98 -90
- data/lib/theman/object.rb +22 -0
- data/lib/theman/version.rb +1 -1
- data/lib/theman.rb +7 -0
- data/spec/agency_spec.rb +246 -0
- data/spec/columns_spec.rb +75 -0
- data/spec/fixtures/temp_eight.csv +249 -0
- data/spec/fixtures/temp_seven.csv +4 -0
- data/spec/object_spec.rb +37 -0
- data/spec/table_spec.rb +5 -0
- data/theman.gemspec +0 -1
- metadata +14 -18
- data/spec/theman_spec.rb +0 -212
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--colour
|
data/README.rdoc
CHANGED
@@ -2,39 +2,49 @@
|
|
2
2
|
|
3
3
|
The man getting you down?
|
4
4
|
|
5
|
-
|
6
|
-
you may only be looking for certain records that match some criteria,
|
7
|
-
enter Theman.
|
5
|
+
Theman lets you import lots of data into postgres very fast.
|
8
6
|
|
9
7
|
== Installation
|
10
8
|
|
11
9
|
gem install 'theman'
|
12
10
|
|
13
|
-
== Basic
|
11
|
+
== Basic usage
|
14
12
|
|
15
|
-
|
16
|
-
my_agent.instance.count
|
13
|
+
Say we have a csv file called <tt>sample.csv</tt> with 220 rows:
|
17
14
|
|
18
|
-
|
19
|
-
|
15
|
+
conn = PGconn.open(:dbname => 'test')
|
16
|
+
|
17
|
+
agent = Theman::Agency.new(conn, 'sample.csv')
|
18
|
+
agent.create!
|
20
19
|
|
21
|
-
|
22
|
-
|
20
|
+
res = conn.exec("SELECT count(*) FROM #{agent.table_name}")
|
21
|
+
res.getvalue(0,0)
|
23
22
|
|
24
|
-
|
23
|
+
=> 220
|
25
24
|
|
26
|
-
|
25
|
+
== Basic usage with Active Record and a simple object
|
26
|
+
|
27
|
+
conn = ActiveRecord::Base.connection.raw_connection
|
28
|
+
|
29
|
+
agent = Theman::Agency.new(conn, 'sample.csv')
|
30
|
+
agent.create!
|
27
31
|
|
28
|
-
|
29
|
-
|
32
|
+
model = Theman::Object(agent.table_name, ActiveRecord::Base)
|
33
|
+
model.count
|
30
34
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
35
|
+
=> 220
|
36
|
+
|
37
|
+
== Advanced usage with Active Record and an existing model
|
38
|
+
|
39
|
+
Theman will call the +create!+ method if you pass in a block.
|
40
|
+
|
41
|
+
conn = ActiveRecord::Base.connection.raw_connection
|
42
|
+
|
43
|
+
agent = Theman::Agency.new conn, 'ugly.csv' do |ag|
|
44
|
+
ag.nulls /"N"/, /"UNKNOWN"/, /""/
|
45
|
+
ag.seds "-n -e :a -e '1,15!{P;N;D;};N;ba'"
|
46
|
+
ag.delimiter "|"
|
47
|
+
ag.table do |t|
|
38
48
|
t.string :name, :limit => 50
|
39
49
|
t.date :date
|
40
50
|
t.integer :ext_id
|
@@ -42,8 +52,11 @@ your back (for now).
|
|
42
52
|
t.boolean :exited
|
43
53
|
end
|
44
54
|
end
|
45
|
-
|
46
|
-
|
55
|
+
|
56
|
+
MyModel.table_name = agent.table_name
|
57
|
+
MyModel.where(:exited => true).count
|
58
|
+
|
59
|
+
=> 220
|
47
60
|
|
48
61
|
In the above example we omitted the last 15 rows, made some things null,
|
49
62
|
added a primary key and changed some column data types to something else.
|
@@ -52,27 +65,32 @@ If you do not provide a table block your columns will be VARCHAR(255); you
|
|
52
65
|
can cherry pick the columns that you want to change the data types for.
|
53
66
|
|
54
67
|
The temp table has no id column by default, but you can add one using the options
|
55
|
-
hash or calling add_primary_key
|
68
|
+
hash or calling <tt>add_primary_key!</tt>; this will add the agents_pkey column.
|
56
69
|
|
57
|
-
|
58
|
-
and Theman will not create a table, in this case you will need to call everything
|
59
|
-
explicitly:
|
70
|
+
WARNING: if you have user input in your sed commands, don't.
|
60
71
|
|
61
|
-
|
62
|
-
smith.stream 'real_ugly.csv'
|
63
|
-
smith.create_table
|
64
|
-
smith.pipe_it
|
72
|
+
== Drop on commit
|
65
73
|
|
66
|
-
|
74
|
+
If you want to use <tt>ON COMMIT DROP</tt> you will need to pass in
|
75
|
+
<tt>:on_commit => :drop</tt> into options and do everything inside a transaction.
|
76
|
+
|
77
|
+
agent = Theman::Agency.new conn, csv, :on_commit => :drop
|
78
|
+
|
79
|
+
agent.transaction do
|
80
|
+
agent.create!
|
81
|
+
# do stuff
|
82
|
+
end
|
83
|
+
|
84
|
+
== No headers
|
67
85
|
|
68
|
-
|
69
|
-
|
86
|
+
If your data does not have headers, pass <tt>:headers => false</tt> into options, but
|
87
|
+
each column must be specified or the import will fail.
|
70
88
|
|
71
89
|
== Dates
|
72
90
|
|
73
91
|
Ah dates, the joy! Use datestyle to tell Theman to then tell PostgreSQL:
|
74
92
|
|
75
|
-
|
93
|
+
agent = Theman::Agency.new conn, 'uber_foie_gras.csv' do |schmit|
|
76
94
|
schmit.datestyle 'European'
|
77
95
|
schmit.table do |t|
|
78
96
|
t.date :start_date
|
@@ -83,34 +101,36 @@ Ah dates, the joy! Use datestyle to tell Theman to then tell PostgreSQL:
|
|
83
101
|
Refer to PostgreSQL docs for more info in the mean time here is some
|
84
102
|
copy and paste action:
|
85
103
|
|
86
|
-
|
104
|
+
<b>ISO</b>
|
87
105
|
|
88
106
|
Use ISO 8601-style dates and times (YYYY-MM-DD HH:MM:SS). This is the default.
|
89
107
|
|
90
|
-
|
108
|
+
<b>SQL</b>
|
91
109
|
|
92
110
|
Use Oracle/Ingres-style dates and times.
|
93
111
|
|
94
|
-
|
112
|
+
<b>PostgreSQL</b>
|
95
113
|
|
96
114
|
Use traditional PostgreSQL format.
|
97
115
|
|
98
|
-
|
116
|
+
<b>German</b>
|
99
117
|
|
100
118
|
dd.mm.yyyy
|
101
119
|
|
102
|
-
|
120
|
+
<b>European</b>
|
103
121
|
|
104
122
|
dd/mm/yyyy
|
105
123
|
|
106
|
-
|
124
|
+
<b>US</b>
|
107
125
|
|
108
126
|
mm/dd/yyyy
|
109
127
|
|
110
|
-
==
|
128
|
+
== My table is empty?
|
111
129
|
|
112
|
-
|
113
|
-
|
130
|
+
PostgreSQL <tt>COPY</tt> requires that the data be well formed, any rows that
|
131
|
+
differ from what the table expects will cause the whole import to fail.
|
132
|
+
If you are importing very large files and the import fails space on disc will still
|
133
|
+
be used until <tt>VACUUM</tt>.
|
114
134
|
|
115
135
|
== Copyright
|
116
136
|
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module Theman
  class Agency
    # Ordered list of column definitions for the table an agent creates.
    #
    # Columns are declared through one method per abstract type (+string+,
    # +integer+, +date+, ...). Declaring a name a second time replaces the
    # earlier definition in place, keeping its position. +to_sql+ renders
    # the list as the column section of a CREATE TABLE statement.
    class Columns
      # Only the writer is generated: the +column+ reader would be replaced
      # by the +column+ definition method below anyway.
      attr_writer :column
      attr_reader :connection

      # conn - object used to quote identifiers; must respond to +quote_ident+.
      def initialize(conn)
        @connection = conn
        @columns = []
      end

      # Renders every declared column, comma separated, in declaration order.
      def to_sql #:nodoc
        @columns.map { |column| column_to_sql(*column) }.join(', ')
      end

      # One declaration method per abstract type, e.g. <tt>t.string :name</tt>.
      %w( string text integer float decimal datetime timestamp time date binary boolean ).each do |type|
        class_eval <<-EOV, __FILE__, __LINE__ + 1
          def #{type}(name, *args)
            column(name, '#{type}', *args)
          end
        EOV
      end

      # Normalises a header or column name to a symbol: spaces become
      # underscores, remaining non-word characters are removed and the
      # result is downcased. Symbols are returned untouched.
      def symbolize(name) #:nodoc
        name.is_a?(Symbol) ? name : name.gsub(/ /,"_").gsub(/\W/, "").downcase.to_sym
      end

      # Adds a column definition, or replaces the existing definition that
      # has the same symbolized name.
      #
      # Returns nil when an existing definition was replaced and the
      # internal column list when a new one was appended.
      def column(name, type, *args) #:nodoc
        sym_col = symbolize(name)
        definition = [sym_col, type, *args]
        position = @columns.index { |existing| existing[0] == sym_col }
        if position
          @columns[position] = definition
          return
        end
        @columns << definition
      end

      # Renders a single column as "<quoted name> <sql type> [NOT NULL] [DEFAULT x]".
      #
      # name    - column name (quoted through the connection).
      # type    - abstract type name as a String ('integer', 'string', ...);
      #           unknown types are emitted verbatim.
      # options - Hash; reads :limit (integer byte size / varchar length),
      #           :null and :default.
      #
      # Raises ArgumentError when an integer :limit maps to no integer type.
      def column_to_sql(name, type, options = {}) #:nodoc
        sql = [quote_column_name(name)]
        case type
        when 'integer'
          limit = options[:limit]
          sql << case limit
                 when nil, false then 'integer'
                 when 1, 2 then 'smallint'
                 when 3, 4 then 'integer'
                 when 5..8 then 'bigint'
                 else
                   # The message previously referenced an undefined local,
                   # which raised NameError instead of this ArgumentError.
                   raise ArgumentError, "No integer type has byte size #{limit}."
                 end
        when 'decimal', 'float'
          sql << 'double precision'
        when 'string'
          sql << (options[:limit] ? "character varying(#{options[:limit]})" : 'character varying(255)')
        when 'binary'
          sql << 'oid'
        when 'time'
          sql << 'time without time zone'
        when 'datetime', 'timestamp'
          sql << 'timestamp without time zone'
        else
          sql << type
        end

        sql << 'NOT NULL' if options[:null] == false

        # nil means "no default"; an explicit +false+ is a real default and
        # must not be dropped. The value is emitted verbatim (not quoted).
        sql << "DEFAULT #{options[:default]}" unless options[:default].nil?

        sql.join(' ')
      end

      # Quotes an identifier using the connection's +quote_ident+.
      def quote_column_name(name) #:nodoc
        @connection.quote_ident(name.to_s)
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Theman
  class Agency
    # Builds the CREATE TABLE statement for an agent's table.
    class Table
      # name      - table name, emitted verbatim.
      # columns   - column definitions as an SQL fragment (placed inside
      #             the parentheses).
      # temporary - pass +false+ for a permanent table; any other value
      #             (including the default nil) creates a TEMPORARY table.
      # on_commit - optional symbol/string such as :drop or :delete_rows;
      #             underscores become spaces and the value is upcased.
      def initialize(name, columns, temporary = nil, on_commit = nil)
        @name = name
        @columns = columns
        @temporary = temporary
        @on_commit = on_commit
      end

      # Returns the CREATE TABLE statement as a single String.
      #
      # TABLE is always emitted; previously it was tied to TEMPORARY, so a
      # permanent table produced the invalid "CREATE <name> (...)".
      def to_sql(sql = []) #:nodoc
        sql << "CREATE"
        sql << "TEMPORARY" unless @temporary == false
        sql << "TABLE"
        sql << @name
        sql << "(#{@columns})"
        unless @on_commit.nil?
          sql << "ON COMMIT"
          sql << @on_commit.to_s.upcase.gsub(/_/, " ")
        end
        sql.join(" ")
      end
    end
  end
end
|
data/lib/theman/agency.rb
CHANGED
@@ -1,129 +1,147 @@
|
|
1
1
|
module Theman
|
2
2
|
class Agency
|
3
|
-
attr_reader :
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
3
|
+
attr_reader :columns, :table_name, :connection
|
4
|
+
# create a new agent object - if a block is passed create! is called
|
5
|
+
#
|
6
|
+
# ==== Parameters
|
7
|
+
# * +conn+ - A database connection from the <tt>PGconn</tt> class
|
8
|
+
# or <tt>ActiveRecord::Base.connection.raw_connection</tt> which
|
9
|
+
# is the same class.
|
10
|
+
# * +stream+ - path to the data file.
|
11
|
+
# * +options+ - Additional options are <tt>:temporary</tt>,
|
12
|
+
# <tt>:on_commit</tt> and <tt>:headers</tt>
|
13
|
+
#
|
14
|
+
# ==== Examples
|
15
|
+
# # Import sample.csv into a new table and count its rows
|
16
|
+
# conn = PGconn.open(:dbname => 'test')
|
17
|
+
# agent = Theman::Agency.new(conn, 'sample.csv')
|
18
|
+
# agent.create!
|
19
|
+
# res = conn.exec("SELECT count(*) FROM #{agent.table_name}")
|
20
|
+
# res.getvalue(0,0)
|
21
|
+
def initialize(conn, stream, options = {}, &block)
|
22
|
+
@stream = stream
|
23
|
+
@connection = conn
|
24
|
+
@options = options
|
25
|
+
|
26
|
+
@table_name = sprintf "agent%010d", rand(100000000)
|
27
|
+
@columns = Columns.new(conn)
|
28
|
+
@stream_columns_set = false
|
29
|
+
|
30
|
+
if block_given?
|
31
|
+
yield self
|
32
|
+
create!
|
29
33
|
end
|
30
34
|
end
|
31
|
-
|
32
|
-
|
33
|
-
|
35
|
+
|
36
|
+
# create a transaction block for use with :on_commit => :drop
|
37
|
+
# Runs the given block between BEGIN and COMMIT on the agent's connection.
#
# If the block raises, ROLLBACK is issued and the exception is re-raised,
# so the connection is not left inside an open transaction.
#
# Returns the result of executing COMMIT.
def transaction(&block)
  connection.exec "BEGIN;"
  begin
    yield
  rescue Exception
    # Every exception is rolled back and then propagated unchanged.
    connection.exec "ROLLBACK;"
    raise
  end
  connection.exec "COMMIT;"
end
|
35
42
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
end
|
42
|
-
EOV
|
43
|
+
def create_stream_columns #:nodoc
|
44
|
+
@stream_columns_set = true
|
45
|
+
headers.split(delimiter_regexp).each do |column|
|
46
|
+
@columns.string column
|
47
|
+
end
|
43
48
|
end
|
44
49
|
|
45
|
-
|
46
|
-
|
47
|
-
@column_names.merge! column_name.to_sym => [column_name, column_type, *args]
|
50
|
+
def headers #:nodoc
|
51
|
+
File.open(@stream, "r"){ |infile| infile.gets }
|
48
52
|
end
|
49
|
-
|
53
|
+
|
54
|
+
# create default columns from stream and replace selected
|
55
|
+
# columns with custom data types from block
|
56
|
+
def table(&block)
|
57
|
+
create_stream_columns unless @options[:headers] == false
|
58
|
+
yield @columns
|
59
|
+
end
|
60
|
+
|
61
|
+
# the location of the data to be sent to Postgres via STDIN (requires a header row)
|
50
62
|
def stream(arg)
|
51
63
|
@stream = arg
|
52
64
|
end
|
53
|
-
|
65
|
+
|
66
|
+
# datestyle of date columns
|
54
67
|
def datestyle(arg)
|
55
68
|
@datestyle = arg
|
56
69
|
end
|
57
|
-
|
70
|
+
|
71
|
+
# values in stream to replace with NULL
|
58
72
|
def nulls(*args)
|
59
73
|
@nulls = args
|
60
74
|
end
|
61
75
|
|
76
|
+
# custom seds to parse stream with
|
62
77
|
def seds(*args)
|
63
78
|
@seds = args
|
64
79
|
end
|
65
80
|
|
81
|
+
# delimiter used in stream - comma is the default
|
66
82
|
def delimiter(arg)
|
67
83
|
@delimiter = arg
|
68
84
|
end
|
69
85
|
|
70
|
-
def
|
71
|
-
|
72
|
-
end
|
73
|
-
|
74
|
-
def psql_copy(psql = [])
|
75
|
-
psql << "COPY #{@instance.table_name} FROM STDIN WITH"
|
86
|
+
def psql_copy(psql = []) #:nodoc
|
87
|
+
psql << "COPY #{table_name} FROM STDIN WITH"
|
76
88
|
psql << "DELIMITER '#{@delimiter}'" unless @delimiter.nil?
|
77
|
-
psql << "CSV
|
89
|
+
psql << "CSV"
|
90
|
+
psql << "HEADER" unless @options[:headers] == false
|
78
91
|
psql
|
79
92
|
end
|
80
93
|
|
81
|
-
def psql_command(psql = [])
|
94
|
+
def psql_command(psql = []) #:nodoc
|
82
95
|
psql << "SET DATESTYLE TO #{@datestyle}" unless @datestyle.nil?
|
83
96
|
psql << psql_copy.join(" ")
|
84
97
|
psql
|
85
98
|
end
|
86
99
|
|
87
|
-
def sed_command(sed = [])
|
100
|
+
def sed_command(sed = []) #:nodoc
|
88
101
|
sed << nulls_to_sed unless @nulls.nil?
|
89
102
|
sed << @seds unless @seds.nil?
|
90
103
|
sed
|
91
104
|
end
|
92
105
|
|
93
|
-
def nulls_to_sed
|
106
|
+
def nulls_to_sed #:nodoc
|
94
107
|
@nulls.map do |regex|
|
95
108
|
"-e 's/#{regex.source}//g'"
|
96
109
|
end
|
97
110
|
end
|
98
111
|
|
99
|
-
|
100
|
-
|
101
|
-
Regexp.new(@delimiter.nil? ? "," : "\\#{@delimiter}")
|
112
|
+
# Regexp used to split the header row into column names.
#
# Matches the configured delimiter literally, or a comma when none is set.
# Regexp.escape is used so that delimiters such as "d" or "t" are not
# turned into the character classes \d / \t by a blind backslash prefix.
# The result is memoized on first use.
def delimiter_regexp #:nodoc
  @delimiter_regexp ||= Regexp.new(@delimiter.nil? ? "," : Regexp.escape(@delimiter.to_s))
end
|
103
|
-
|
104
|
-
#
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
line.split(delimiter_regexp).each do |col|
|
112
|
-
column_name = symbolize(col)
|
113
|
-
if custom = @column_names.fetch(column_name, nil)
|
114
|
-
t.column(*custom)
|
115
|
-
else
|
116
|
-
t.string column_name
|
117
|
-
end
|
118
|
-
end
|
119
|
-
break
|
120
|
-
end
|
115
|
+
|
116
|
+
# PostgreSQL COPY command using STDIN
|
117
|
+
# - reads chunks of 8192 bytes to save memory
|
118
|
+
# System command for IO subprocesses are piped to
|
119
|
+
# take advantage of multi cores
|
120
|
+
def create!
|
121
|
+
unless @stream_columns_set || @options[:headers] == false
|
122
|
+
create_stream_columns
|
121
123
|
end
|
124
|
+
connection.exec Table.new(table_name, @columns.to_sql, @options[:temporary], @options[:on_commit]).to_sql
|
125
|
+
pipe_it
|
122
126
|
end
|
123
127
|
|
124
|
-
#
|
125
|
-
|
126
|
-
|
128
|
+
# adds a serial column called agents_pkey and sets as primary key
|
129
|
+
def add_primary_key!
|
130
|
+
connection.exec "ALTER TABLE #{table_name} ADD COLUMN agents_pkey serial PRIMARY KEY;"
|
131
|
+
end
|
132
|
+
|
133
|
+
# analyzes the table for efficient query construction on tables larger than ~1000 tuples
|
134
|
+
def analyze!
|
135
|
+
connection.exec "ANALYZE #{table_name};"
|
136
|
+
end
|
137
|
+
|
138
|
+
# explicitly drop table
|
139
|
+
def drop!
|
140
|
+
connection.exec "DROP TABLE #{table_name};"
|
141
|
+
@table_name = nil
|
142
|
+
end
|
143
|
+
|
144
|
+
def system_command #:nodoc
|
127
145
|
unless sed_command.empty?
|
128
146
|
"cat #{@stream} | sed #{sed_command.join(" | sed ")}"
|
129
147
|
else
|
@@ -131,27 +149,17 @@ module Theman
|
|
131
149
|
end
|
132
150
|
end
|
133
151
|
|
134
|
-
|
135
|
-
|
136
|
-
def add_primary_key
|
137
|
-
instance.connection.raw_connection.query "ALTER TABLE #{instance.table_name} ADD COLUMN agents_pkey serial PRIMARY KEY;"
|
138
|
-
end
|
139
|
-
|
140
|
-
# use postgress COPY command using STDIN with CSV HEADER
|
141
|
-
# reads chunks of 8192 bytes to save memory
|
142
|
-
def pipe_it(l = "")
|
143
|
-
raise "table does not exist" unless instance.table_exists?
|
144
|
-
raw = instance.connection.raw_connection
|
145
|
-
raw.query psql_command.join("; ")
|
152
|
+
# Streams the output of +system_command+ into the COPY started by
# +psql_command+, 8192 bytes at a time.
#
# l - String reused as the read buffer for every chunk.
#
# IO#read with a length returns nil at end of file rather than raising
# EOFError, so the pipe is closed in +ensure+; the previous close inside
# +rescue EOFError+ never ran and leaked the subprocess pipe.
def pipe_it(l = "") #:nodoc
  connection.exec psql_command.join("; ")
  f = IO.popen(system_command)
  begin
    while f.read(8192, l)
      connection.put_copy_data l
    end
  ensure
    f.close
  end
  connection.put_copy_end
end
|
156
164
|
end
|
157
165
|
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Theman
  # Factory for anonymous classes bound to an agent's table.
  class Object
    # Builds and returns a new anonymous subclass of +parent+.
    #
    # table_name - name of the table; converted with +to_s+.
    # parent     - superclass of the generated class (default ::Object).
    #              When it responds to +set_table_name+ that method is
    #              called with the table name; otherwise it is skipped, so
    #              the default parent no longer raises NoMethodError.
    # conn       - optional connection, stored in the +@@connection+ class
    #              variable when given.
    #              NOTE(review): nothing visible here reads it — confirm
    #              intended use.
    #
    # The generated class answers +table_name+ with the table name and
    # +inspect+ with "Agent (<table name>)".
    def self.new(table_name, parent = ::Object, conn = nil)
      name  = table_name.to_s
      label = "Agent (#{name})"
      Class.new(parent) do
        unless conn.nil?
          @@connection = conn
        end
        # String#inspect embeds the values as escaped Ruby literals, so a
        # quote or interpolation sequence in the table name cannot break or
        # inject code into the evaluated source.
        instance_eval <<-EOV, __FILE__, __LINE__ + 1
          set_table_name #{name.inspect} if respond_to?(:set_table_name)

          def table_name
            #{name.inspect}
          end

          def inspect
            #{label.inspect}
          end
        EOV
      end
    end
  end
end
|
data/lib/theman/version.rb
CHANGED
data/lib/theman.rb
CHANGED