theman 0.0.5 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -0
- data/README.rdoc +64 -44
- data/lib/theman/agency/columns.rb +95 -0
- data/lib/theman/agency/table.rb +24 -0
- data/lib/theman/agency.rb +98 -90
- data/lib/theman/object.rb +22 -0
- data/lib/theman/version.rb +1 -1
- data/lib/theman.rb +7 -0
- data/spec/agency_spec.rb +246 -0
- data/spec/columns_spec.rb +75 -0
- data/spec/fixtures/temp_eight.csv +249 -0
- data/spec/fixtures/temp_seven.csv +4 -0
- data/spec/object_spec.rb +37 -0
- data/spec/table_spec.rb +5 -0
- data/theman.gemspec +0 -1
- metadata +14 -18
- data/spec/theman_spec.rb +0 -212
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--colour
|
data/README.rdoc
CHANGED
@@ -2,39 +2,49 @@
|
|
2
2
|
|
3
3
|
The man getting you down?
|
4
4
|
|
5
|
-
|
6
|
-
you may only be looking for certain records that match some criteria,
|
7
|
-
enter Theman.
|
5
|
+
Theman lets you import lots of data into postgres very fast.
|
8
6
|
|
9
7
|
== Installation
|
10
8
|
|
11
9
|
gem install 'theman'
|
12
10
|
|
13
|
-
== Basic
|
11
|
+
== Basic usage
|
14
12
|
|
15
|
-
|
16
|
-
my_agent.instance.count
|
13
|
+
Say we have a csv file called <tt>sample.csv</tt> with 220 rows:
|
17
14
|
|
18
|
-
|
19
|
-
|
15
|
+
conn = PGconn.open(:dbname => 'test')
|
16
|
+
|
17
|
+
agent = Theman::Agency.new(conn, 'sample.csv')
|
18
|
+
agent.create!
|
20
19
|
|
21
|
-
|
22
|
-
|
20
|
+
res = conn.exec("SELECT count(*) FROM #{agent.table_name}")
|
21
|
+
res.getvalue(0,0)
|
23
22
|
|
24
|
-
|
23
|
+
=> 220
|
25
24
|
|
26
|
-
|
25
|
+
== Basic usage with Active Record and a simple object
|
26
|
+
|
27
|
+
conn = ActiveRecord::Base.connection.raw_connection
|
28
|
+
|
29
|
+
agent = Theman::Agency.new(conn, 'sample.csv')
|
30
|
+
agent.create!
|
27
31
|
|
28
|
-
|
29
|
-
|
32
|
+
model = Theman::Object(agent.table_name, ActiveRecord::Base)
|
33
|
+
model.count
|
30
34
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
35
|
+
=> 220
|
36
|
+
|
37
|
+
== Advanced usage with Active Record and an existing model
|
38
|
+
|
39
|
+
Theman will call the +create!+ method if you pass in a block.
|
40
|
+
|
41
|
+
conn = ActiveRecord::Base.connection.raw_connection
|
42
|
+
|
43
|
+
agent = Theman::Agency.new conn, 'ugly.csv' do |ag|
|
44
|
+
ag.nulls /"N"/, /"UNKNOWN"/, /""/
|
45
|
+
ag.seds "-n -e :a -e '1,15!{P;N;D;};N;ba'"
|
46
|
+
ag.delimiter "|"
|
47
|
+
ag.table do |t|
|
38
48
|
t.string :name, :limit => 50
|
39
49
|
t.date :date
|
40
50
|
t.integer :ext_id
|
@@ -42,8 +52,11 @@ your back (for now).
|
|
42
52
|
t.boolean :exited
|
43
53
|
end
|
44
54
|
end
|
45
|
-
|
46
|
-
|
55
|
+
|
56
|
+
MyModel.table_name = agent.table_name
|
57
|
+
MyModel.where(:exited => true).count
|
58
|
+
|
59
|
+
=> 220
|
47
60
|
|
48
61
|
In the above example we omitted the last 15 rows, made some things null,
|
49
62
|
added a primary key and changed some column data types to something else.
|
@@ -52,27 +65,32 @@ If you do not provide a table block your columns will be VARCHAR(255); you
|
|
52
65
|
can cherry pick the columns that you want to change the data types for.
|
53
66
|
|
54
67
|
The temp table has no id column by default, but you can add one using the options
|
55
|
-
hash or calling add_primary_key
|
68
|
+
hash or calling +add_primary_key!+, which will add the agents_pkey column.
|
56
69
|
|
57
|
-
|
58
|
-
and Theman will not create a table, in this case you will need to call everything
|
59
|
-
explicitly:
|
70
|
+
WARNING: if you have user input in your sed commands, don't.
|
60
71
|
|
61
|
-
|
62
|
-
smith.stream 'real_ugly.csv'
|
63
|
-
smith.create_table
|
64
|
-
smith.pipe_it
|
72
|
+
== Drop on commit
|
65
73
|
|
66
|
-
|
74
|
+
If you want to use <tt>ON COMMIT DROP</tt> you will need to pass in
|
75
|
+
<tt>:on_commit => :drop</tt> into options and do everything inside a transaction.
|
76
|
+
|
77
|
+
agent = Theman::Agency.new conn, csv, :on_commit => :drop
|
78
|
+
|
79
|
+
agent.transaction do
|
80
|
+
agent.create!
|
81
|
+
# do stuff
|
82
|
+
end
|
83
|
+
|
84
|
+
== No headers
|
67
85
|
|
68
|
-
|
69
|
-
|
86
|
+
If your data does not have headers pass into options <tt>:headers => false</tt>, but
|
87
|
+
each column must be specified or the import will fail.
|
70
88
|
|
71
89
|
== Dates
|
72
90
|
|
73
91
|
Ah dates, the joy! Use datestyle to tell Theman to then tell PostgreSQL:
|
74
92
|
|
75
|
-
|
93
|
+
agent = Theman::Agency.new conn, 'uber_foie_gras.csv' do |schmit|
|
76
94
|
schmit.datestyle 'European'
|
77
95
|
schmit.table do |t|
|
78
96
|
t.date :start_date
|
@@ -83,34 +101,36 @@ Ah dates, the joy! Use datestyle to tell Theman to then tell PostgreSQL:
|
|
83
101
|
Refer to the PostgreSQL docs for more info; in the meantime here is some
|
84
102
|
copy and paste action:
|
85
103
|
|
86
|
-
|
104
|
+
<b>ISO</b>
|
87
105
|
|
88
106
|
Use ISO 8601-style dates and times (YYYY-MM-DD HH:MM:SS). This is the default.
|
89
107
|
|
90
|
-
|
108
|
+
<b>SQL</b>
|
91
109
|
|
92
110
|
Use Oracle/Ingres-style dates and times.
|
93
111
|
|
94
|
-
|
112
|
+
<b>PostgreSQL</b>
|
95
113
|
|
96
114
|
Use traditional PostgreSQL format.
|
97
115
|
|
98
|
-
|
116
|
+
<b>German</b>
|
99
117
|
|
100
118
|
dd.mm.yyyy
|
101
119
|
|
102
|
-
|
120
|
+
<b>European</b>
|
103
121
|
|
104
122
|
dd/mm/yyyy
|
105
123
|
|
106
|
-
|
124
|
+
<b>US</b>
|
107
125
|
|
108
126
|
mm/dd/yyyy
|
109
127
|
|
110
|
-
==
|
128
|
+
== My table is empty?
|
111
129
|
|
112
|
-
|
113
|
-
|
130
|
+
PostgreSQL <tt>COPY</tt> requires that the data be well formed, any rows that
|
131
|
+
are different to what is expected by the table will cause the whole import to fail.
|
132
|
+
If you are importing very large files and the import fails space on disc will still
|
133
|
+
be used until <tt>VACUUM</tt>.
|
114
134
|
|
115
135
|
== Copyright
|
116
136
|
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module Theman
|
2
|
+
class Agency
|
3
|
+
class Columns
|
4
|
+
attr_accessor :column
|
5
|
+
attr_reader :connection
|
6
|
+
|
7
|
+
def initialize(conn)
|
8
|
+
@connection = conn
|
9
|
+
@columns = []
|
10
|
+
end
|
11
|
+
|
12
|
+
def to_sql #:nodoc
|
13
|
+
@columns.map{|column| column_to_sql(*column)}.join(', ')
|
14
|
+
end
|
15
|
+
|
16
|
+
%w( string text integer float decimal datetime timestamp time date binary boolean ).each do |type|
|
17
|
+
class_eval <<-EOV, __FILE__, __LINE__ + 1
|
18
|
+
def #{type}(name, *args)
|
19
|
+
column(name, '#{type}', *args)
|
20
|
+
end
|
21
|
+
EOV
|
22
|
+
end
|
23
|
+
|
24
|
+
def symbolize(name) #:nodoc
|
25
|
+
name.is_a?(Symbol) ? name : name.gsub(/ /,"_").gsub(/\W/, "").downcase.to_sym
|
26
|
+
end
|
27
|
+
|
28
|
+
def column(name, type, *args) #:nodoc
|
29
|
+
sym_col = symbolize(name)
|
30
|
+
@columns.each_with_index do |column, index|
|
31
|
+
if column[0] == sym_col
|
32
|
+
@columns[index] = [sym_col, type, *args]
|
33
|
+
return
|
34
|
+
end
|
35
|
+
end
|
36
|
+
@columns << [sym_col, type, *args]
|
37
|
+
end
|
38
|
+
|
39
|
+
def column_to_sql(name, type, options = {}) #:nodoc
|
40
|
+
sql = [quote_column_name(name)]
|
41
|
+
case type
|
42
|
+
when 'integer'
|
43
|
+
if options[:limit]
|
44
|
+
case options[:limit]
|
45
|
+
when 1, 2;
|
46
|
+
sql << 'smallint'
|
47
|
+
when 3, 4;
|
48
|
+
sql << 'integer'
|
49
|
+
when 5..8;
|
50
|
+
sql << 'bigint'
|
51
|
+
else
|
52
|
+
raise ArgumentError, "No integer type has byte size #{limit}."
|
53
|
+
end
|
54
|
+
else
|
55
|
+
sql << 'integer'
|
56
|
+
end
|
57
|
+
when 'decimal'
|
58
|
+
sql << 'double precision'
|
59
|
+
when 'float'
|
60
|
+
sql << 'double precision'
|
61
|
+
when 'string'
|
62
|
+
if options[:limit]
|
63
|
+
sql << "character varying(#{options[:limit]})"
|
64
|
+
else
|
65
|
+
sql << 'character varying(255)'
|
66
|
+
end
|
67
|
+
when 'binary'
|
68
|
+
sql << 'oid'
|
69
|
+
when 'time'
|
70
|
+
sql << 'time without time zone'
|
71
|
+
when 'datetime'
|
72
|
+
sql << 'timestamp without time zone'
|
73
|
+
when 'timestamp'
|
74
|
+
sql << 'timestamp without time zone'
|
75
|
+
else
|
76
|
+
sql << type
|
77
|
+
end
|
78
|
+
|
79
|
+
if options[:null] == false
|
80
|
+
sql << 'NOT NULL'
|
81
|
+
end
|
82
|
+
|
83
|
+
if options[:default]
|
84
|
+
sql << "DEFAULT #{options[:default]}"
|
85
|
+
end
|
86
|
+
|
87
|
+
sql.join(' ')
|
88
|
+
end
|
89
|
+
|
90
|
+
def quote_column_name(name) #:nodoc
|
91
|
+
@connection.quote_ident(name.to_s)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Theman
|
2
|
+
class Agency
|
3
|
+
class Table
|
4
|
+
def initialize(name, columns, temporary = nil, on_commit = nil)
|
5
|
+
@name = name
|
6
|
+
@columns = columns
|
7
|
+
@temporary = temporary
|
8
|
+
@on_commit = on_commit
|
9
|
+
end
|
10
|
+
|
11
|
+
def to_sql(sql = []) #:nodoc
|
12
|
+
sql << ["CREATE"]
|
13
|
+
sql << "TEMPORARY TABLE" unless @temporary == false
|
14
|
+
sql << @name
|
15
|
+
sql << "(#{@columns})"
|
16
|
+
unless @on_commit.nil?
|
17
|
+
sql << "ON COMMIT"
|
18
|
+
sql << @on_commit.to_s.upcase.gsub(/_/," ")
|
19
|
+
end
|
20
|
+
sql.join(" ")
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
data/lib/theman/agency.rb
CHANGED
@@ -1,129 +1,147 @@
|
|
1
1
|
module Theman
|
2
2
|
class Agency
|
3
|
-
attr_reader :
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
3
|
+
attr_reader :columns, :table_name, :connection
|
4
|
+
# create a new agent object - if a block is passed create! is called
|
5
|
+
#
|
6
|
+
# ==== Parameters
|
7
|
+
# * +conn+ - A database connection from the <tt>PGconn</tt> class
|
8
|
+
# or <tt>ActiveRecord::Base.connection.raw_connection</tt> which
|
9
|
+
# is the same class.
|
10
|
+
# * +stream+ - path to the data file.
|
11
|
+
# * +options+ - Additional options are <tt>:temporary</tt>,
|
12
|
+
# <tt>:on_commit</tt> and <tt>:headers</tt>
|
13
|
+
#
|
14
|
+
# ==== Examples
|
15
|
+
# # Import sample.csv into a table and count the imported rows
|
16
|
+
# conn = PGconn.open(:dbname => 'test')
|
17
|
+
# agent = Theman::Agency.new(conn, 'sample.csv')
|
18
|
+
# agent.create!
|
19
|
+
# res = conn.exec("SELECT count(*) FROM #{agent.table_name}")
|
20
|
+
# res.getvalue(0,0)
|
21
|
+
def initialize(conn, stream, options = {}, &block)
|
22
|
+
@stream = stream
|
23
|
+
@connection = conn
|
24
|
+
@options = options
|
25
|
+
|
26
|
+
@table_name = sprintf "agent%010d", rand(100000000)
|
27
|
+
@columns = Columns.new(conn)
|
28
|
+
@stream_columns_set = false
|
29
|
+
|
30
|
+
if block_given?
|
31
|
+
yield self
|
32
|
+
create!
|
29
33
|
end
|
30
34
|
end
|
31
|
-
|
32
|
-
|
33
|
-
|
35
|
+
|
36
|
+
# create a transaction block for use with :on_commit => :drop
|
37
|
+
def transaction(&block)
|
38
|
+
connection.exec "BEGIN;"
|
39
|
+
yield
|
40
|
+
connection.exec "COMMIT;"
|
34
41
|
end
|
35
42
|
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
end
|
42
|
-
EOV
|
43
|
+
def create_stream_columns #:nodoc
|
44
|
+
@stream_columns_set = true
|
45
|
+
headers.split(delimiter_regexp).each do |column|
|
46
|
+
@columns.string column
|
47
|
+
end
|
43
48
|
end
|
44
49
|
|
45
|
-
|
46
|
-
|
47
|
-
@column_names.merge! column_name.to_sym => [column_name, column_type, *args]
|
50
|
+
def headers #:nodoc
|
51
|
+
File.open(@stream, "r"){ |infile| infile.gets }
|
48
52
|
end
|
49
|
-
|
53
|
+
|
54
|
+
# create default columns from stream and replace selected
|
55
|
+
# columns with custom data types from block
|
56
|
+
def table(&block)
|
57
|
+
create_stream_columns unless @options[:headers] == false
|
58
|
+
yield @columns
|
59
|
+
end
|
60
|
+
|
61
|
+
# the location of the data to be sent to Postgres via STDIN (requires a header row)
|
50
62
|
def stream(arg)
|
51
63
|
@stream = arg
|
52
64
|
end
|
53
|
-
|
65
|
+
|
66
|
+
# datestyle of date columns
|
54
67
|
def datestyle(arg)
|
55
68
|
@datestyle = arg
|
56
69
|
end
|
57
|
-
|
70
|
+
|
71
|
+
# values in stream to replace with NULL
|
58
72
|
def nulls(*args)
|
59
73
|
@nulls = args
|
60
74
|
end
|
61
75
|
|
76
|
+
# custom seds to parse stream with
|
62
77
|
def seds(*args)
|
63
78
|
@seds = args
|
64
79
|
end
|
65
80
|
|
81
|
+
# delimiter used in stream - comma is the default
|
66
82
|
def delimiter(arg)
|
67
83
|
@delimiter = arg
|
68
84
|
end
|
69
85
|
|
70
|
-
def
|
71
|
-
|
72
|
-
end
|
73
|
-
|
74
|
-
def psql_copy(psql = [])
|
75
|
-
psql << "COPY #{@instance.table_name} FROM STDIN WITH"
|
86
|
+
def psql_copy(psql = []) #:nodoc
|
87
|
+
psql << "COPY #{table_name} FROM STDIN WITH"
|
76
88
|
psql << "DELIMITER '#{@delimiter}'" unless @delimiter.nil?
|
77
|
-
psql << "CSV
|
89
|
+
psql << "CSV"
|
90
|
+
psql << "HEADER" unless @options[:headers] == false
|
78
91
|
psql
|
79
92
|
end
|
80
93
|
|
81
|
-
def psql_command(psql = [])
|
94
|
+
def psql_command(psql = []) #:nodoc
|
82
95
|
psql << "SET DATESTYLE TO #{@datestyle}" unless @datestyle.nil?
|
83
96
|
psql << psql_copy.join(" ")
|
84
97
|
psql
|
85
98
|
end
|
86
99
|
|
87
|
-
def sed_command(sed = [])
|
100
|
+
def sed_command(sed = []) #:nodoc
|
88
101
|
sed << nulls_to_sed unless @nulls.nil?
|
89
102
|
sed << @seds unless @seds.nil?
|
90
103
|
sed
|
91
104
|
end
|
92
105
|
|
93
|
-
def nulls_to_sed
|
106
|
+
def nulls_to_sed #:nodoc
|
94
107
|
@nulls.map do |regex|
|
95
108
|
"-e 's/#{regex.source}//g'"
|
96
109
|
end
|
97
110
|
end
|
98
111
|
|
99
|
-
|
100
|
-
|
101
|
-
Regexp.new(@delimiter.nil? ? "," : "\\#{@delimiter}")
|
112
|
+
def delimiter_regexp #:nodoc
|
113
|
+
@delimiter_regexp ||= Regexp.new(@delimiter.nil? ? "," : "\\#{@delimiter}")
|
102
114
|
end
|
103
|
-
|
104
|
-
#
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
line.split(delimiter_regexp).each do |col|
|
112
|
-
column_name = symbolize(col)
|
113
|
-
if custom = @column_names.fetch(column_name, nil)
|
114
|
-
t.column(*custom)
|
115
|
-
else
|
116
|
-
t.string column_name
|
117
|
-
end
|
118
|
-
end
|
119
|
-
break
|
120
|
-
end
|
115
|
+
|
116
|
+
# PostgreSQL COPY command using STDIN
|
117
|
+
# - reads chunks of 8192 bytes to save memory
|
118
|
+
# System commands for IO subprocesses are piped to
|
119
|
+
# take advantage of multi cores
|
120
|
+
def create!
|
121
|
+
unless @stream_columns_set || @options[:headers] == false
|
122
|
+
create_stream_columns
|
121
123
|
end
|
124
|
+
connection.exec Table.new(table_name, @columns.to_sql, @options[:temporary], @options[:on_commit]).to_sql
|
125
|
+
pipe_it
|
122
126
|
end
|
123
127
|
|
124
|
-
#
|
125
|
-
|
126
|
-
|
128
|
+
# adds a serial column called agents_pkey and sets as primary key
|
129
|
+
def add_primary_key!
|
130
|
+
connection.exec "ALTER TABLE #{table_name} ADD COLUMN agents_pkey serial PRIMARY KEY;"
|
131
|
+
end
|
132
|
+
|
133
|
+
# analyzes the table for efficient query construction on tables larger than ~1000 tuples
|
134
|
+
def analyze!
|
135
|
+
connection.exec "ANALYZE #{table_name};"
|
136
|
+
end
|
137
|
+
|
138
|
+
# explicitly drop table
|
139
|
+
def drop!
|
140
|
+
connection.exec "DROP TABLE #{table_name};"
|
141
|
+
@table_name = nil
|
142
|
+
end
|
143
|
+
|
144
|
+
def system_command #:nodoc
|
127
145
|
unless sed_command.empty?
|
128
146
|
"cat #{@stream} | sed #{sed_command.join(" | sed ")}"
|
129
147
|
else
|
@@ -131,27 +149,17 @@ module Theman
|
|
131
149
|
end
|
132
150
|
end
|
133
151
|
|
134
|
-
|
135
|
-
|
136
|
-
def add_primary_key
|
137
|
-
instance.connection.raw_connection.query "ALTER TABLE #{instance.table_name} ADD COLUMN agents_pkey serial PRIMARY KEY;"
|
138
|
-
end
|
139
|
-
|
140
|
-
# use postgress COPY command using STDIN with CSV HEADER
|
141
|
-
# reads chunks of 8192 bytes to save memory
|
142
|
-
def pipe_it(l = "")
|
143
|
-
raise "table does not exist" unless instance.table_exists?
|
144
|
-
raw = instance.connection.raw_connection
|
145
|
-
raw.query psql_command.join("; ")
|
152
|
+
def pipe_it(l = "") #:nodoc
|
153
|
+
connection.exec psql_command.join("; ")
|
146
154
|
f = IO.popen(system_command)
|
147
155
|
begin
|
148
156
|
while f.read(8192, l)
|
149
|
-
|
157
|
+
connection.put_copy_data l
|
150
158
|
end
|
151
159
|
rescue EOFError
|
152
160
|
f.close
|
153
161
|
end
|
154
|
-
|
162
|
+
connection.put_copy_end
|
155
163
|
end
|
156
164
|
end
|
157
165
|
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Theman
|
2
|
+
class Object
|
3
|
+
def self.new(table_name, parent = ::Object, conn = nil)
|
4
|
+
Class.new(parent) do
|
5
|
+
unless conn.nil?
|
6
|
+
@@connection = conn
|
7
|
+
end
|
8
|
+
instance_eval <<-EOV, __FILE__, __LINE__ + 1
|
9
|
+
set_table_name "#{table_name}"
|
10
|
+
|
11
|
+
def table_name
|
12
|
+
"#{table_name}"
|
13
|
+
end
|
14
|
+
|
15
|
+
def inspect
|
16
|
+
"Agent (#{table_name})"
|
17
|
+
end
|
18
|
+
EOV
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
data/lib/theman/version.rb
CHANGED
data/lib/theman.rb
CHANGED