theman 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +60 -54
- data/lib/theman/agency.rb +157 -0
- data/lib/theman/version.rb +1 -1
- data/lib/theman.rb +1 -1
- data/spec/fixtures/temp_six.txt +5 -0
- data/spec/theman_spec.rb +45 -2
- metadata +5 -4
- data/lib/theman/themans_agency.rb +0 -133
data/README.rdoc
CHANGED
@@ -2,109 +2,115 @@
|
|
2
2
|
|
3
3
|
The man getting you down?
|
4
4
|
|
5
|
-
FasterCSV is great
|
6
|
-
|
7
|
-
|
5
|
+
FasterCSV is great but when you get to 100MB files it takes too long and
|
6
|
+
you may only be looking for certain records that match some criteria,
|
7
|
+
enter Theman.
|
8
8
|
|
9
9
|
== Installation
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
gem 'theman'
|
11
|
+
gem install 'theman'
|
14
12
|
|
15
|
-
|
13
|
+
== Basic Usage
|
16
14
|
|
17
|
-
|
15
|
+
my_agent = ::Theman::Agency.new 'pretty.csv'
|
16
|
+
my_agent.instance.count
|
18
17
|
|
19
|
-
|
18
|
+
Say if you wanted to analyse some data the man has given you, you could make the
|
19
|
+
table persist by doing the following in an IRB session:
|
20
20
|
|
21
|
-
|
21
|
+
require 'active_record'
|
22
|
+
require 'theman'
|
22
23
|
|
23
|
-
|
24
|
+
ActiveRecord::Base.establish_connection :database => $USER, :host => "localhost", :adapter => "postgresql"
|
24
25
|
|
25
|
-
|
26
|
+
my_agent = ::Theman::Agency.new('pretty.csv', ActiveRecord::Base, :temporary => false) {|a| a.date :date_col }
|
26
27
|
|
27
|
-
|
28
|
-
|
29
|
-
temp_model.count
|
28
|
+
After you have figured out what the hell is going on with the data, you can get the man off
|
29
|
+
your back (for now).
|
30
30
|
|
31
31
|
== Advanced Usage
|
32
32
|
|
33
|
-
my_agent = ::Theman::Agency.new 'ugly.csv' do |
|
34
|
-
|
35
|
-
|
36
|
-
|
33
|
+
my_agent = ::Theman::Agency.new 'ugly.csv', ActiveRecord::Base, :primary_key => true do |s|
|
34
|
+
s.nulls /"N"/, /"UNKNOWN"/, /""/
|
35
|
+
s.seds "-n -e :a -e '1,15!{P;N;D;};N;ba'"
|
36
|
+
s.delimiter "|"
|
37
|
+
s.table do |t|
|
38
|
+
t.string :name, :limit => 50
|
37
39
|
t.date :date
|
38
40
|
t.integer :ext_id
|
39
41
|
t.float :amount
|
40
42
|
t.boolean :exited
|
41
43
|
end
|
42
44
|
end
|
43
|
-
temp_model = my_agent.instance
|
44
|
-
temp_model.where(:exited => true).count
|
45
45
|
|
46
|
-
|
46
|
+
my_agent.instance.where(:exited => true).count
|
47
|
+
|
48
|
+
In the above example we omitted the last 15 rows, made some things null,
|
49
|
+
added a primary key and changed some column data types to something else.
|
47
50
|
|
48
|
-
If you do not provide a table block your columns will be VARCHAR(255)
|
49
|
-
can cherry pick
|
51
|
+
If you do not provide a table block your columns will be VARCHAR(255); you
|
52
|
+
can cherry pick the columns that you want to change the data types for.
|
50
53
|
|
51
|
-
The temp table has no id column but you
|
54
|
+
The temp table has no id column by default, but you can add one using the options
|
55
|
+
hash or by calling add_primary_key; this will add the agents_pkey column.
|
52
56
|
|
53
|
-
If you want to call this procedural just don't pass in the path to the file
|
54
|
-
and Theman will not create a table in
|
55
|
-
|
57
|
+
If you want to call this procedural style just don't pass in the path to the file
|
58
|
+
and Theman will not create a table; in this case you will need to call everything
|
59
|
+
explicitly:
|
56
60
|
|
57
61
|
smith = ::Theman::Agency.new
|
58
62
|
smith.stream 'real_ugly.csv'
|
59
|
-
smith.seds "-n -e :a -e '1,15!{P;N;D;};N;ba'"
|
60
|
-
smith.nulls /"XXXX"/
|
61
|
-
smith.date :date
|
62
|
-
|
63
63
|
smith.create_table
|
64
64
|
smith.pipe_it
|
65
65
|
|
66
|
+
WARNING: never put user input in your sed commands; they are passed to the shell unescaped.
|
67
|
+
|
68
|
+
NOTE: When passing in extra sed commands each one will be given its own subprocess,
|
69
|
+
this makes it much faster if you have more than one core.
|
70
|
+
|
66
71
|
== Dates
|
67
72
|
|
68
|
-
Ah dates,
|
73
|
+
Ah dates, the joy! Use datestyle to tell Theman to then tell PostgreSQL:
|
69
74
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
+
my_agent = ::Theman::Agency.new 'uber_foie_gras.csv' do |schmit|
|
76
|
+
schmit.datestyle 'European'
|
77
|
+
schmit.table do |t|
|
78
|
+
t.date :start_date
|
79
|
+
t.date :end_date
|
75
80
|
end
|
81
|
+
end
|
76
82
|
|
77
|
-
Refer to
|
83
|
+
Refer to the PostgreSQL docs for more info; in the meantime here is some
|
84
|
+
copy and paste action:
|
78
85
|
|
79
|
-
ISO
|
86
|
+
=== ISO
|
80
87
|
|
81
|
-
|
88
|
+
Use ISO 8601-style dates and times (YYYY-MM-DD HH:MM:SS). This is the default.
|
82
89
|
|
83
|
-
SQL
|
90
|
+
=== SQL
|
84
91
|
|
85
|
-
|
86
|
-
(which mandates ISO 8601 style), the naming of this option is a historical accident.
|
92
|
+
Use Oracle/Ingres-style dates and times.
|
87
93
|
|
88
|
-
PostgreSQL
|
94
|
+
=== PostgreSQL
|
89
95
|
|
90
|
-
|
96
|
+
Use traditional PostgreSQL format.
|
91
97
|
|
92
|
-
German
|
98
|
+
=== German
|
93
99
|
|
94
|
-
|
100
|
+
dd.mm.yyyy
|
95
101
|
|
96
|
-
European
|
102
|
+
=== European
|
97
103
|
|
98
|
-
|
104
|
+
dd/mm/yyyy
|
99
105
|
|
100
|
-
US
|
106
|
+
=== US
|
101
107
|
|
102
|
-
|
108
|
+
mm/dd/yyyy
|
103
109
|
|
104
110
|
== Troubles
|
105
111
|
|
106
|
-
Table empty? the man (the real life one) has given you crappy data and
|
107
|
-
has silently dissed it.
|
112
|
+
Table empty? the man (the real life one) has given you crappy data and
|
113
|
+
PostgreSQL has silently dissed it.
|
108
114
|
|
109
115
|
== Copyright
|
110
116
|
|
@@ -0,0 +1,157 @@
|
|
1
|
+
module Theman
|
2
|
+
class Agency
|
3
|
+
attr_reader :instance, :column_names
|
4
|
+
|
5
|
+
def initialize(stream = nil, parent = ::ActiveRecord::Base, options = {})
|
6
|
+
@options = options
|
7
|
+
@stream = stream
|
8
|
+
|
9
|
+
agent_id = sprintf "agent%010d", rand(100000000)
|
10
|
+
@column_names = {}
|
11
|
+
@instance = Class.new(parent) do
|
12
|
+
instance_eval <<-EOV, __FILE__, __LINE__ + 1
|
13
|
+
set_table_name "#{agent_id}"
|
14
|
+
def table_name
|
15
|
+
"#{agent_id}"
|
16
|
+
end
|
17
|
+
def inspect
|
18
|
+
"Agent (#{agent_id})"
|
19
|
+
end
|
20
|
+
EOV
|
21
|
+
end
|
22
|
+
|
23
|
+
yield self if block_given?
|
24
|
+
return unless stream
|
25
|
+
create_table
|
26
|
+
pipe_it
|
27
|
+
if @options[:primary_key]
|
28
|
+
add_primary_key
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def table
|
33
|
+
yield self if block_given?
|
34
|
+
end
|
35
|
+
|
36
|
+
# column data type methods
|
37
|
+
%w( string text integer float decimal datetime timestamp time date binary boolean ).each do |column_type|
|
38
|
+
class_eval <<-EOV, __FILE__, __LINE__ + 1
|
39
|
+
def #{column_type}(column_name, *args)
|
40
|
+
column(column_name, '#{column_type}', *args)
|
41
|
+
end
|
42
|
+
EOV
|
43
|
+
end
|
44
|
+
|
45
|
+
# overrides the default string type column
|
46
|
+
def column(column_name, column_type, *args)
|
47
|
+
@column_names.merge! column_name.to_sym => [column_name, column_type, *args]
|
48
|
+
end
|
49
|
+
|
50
|
+
def stream(arg)
|
51
|
+
@stream = arg
|
52
|
+
end
|
53
|
+
|
54
|
+
def datestyle(arg)
|
55
|
+
@datestyle = arg
|
56
|
+
end
|
57
|
+
|
58
|
+
def nulls(*args)
|
59
|
+
@nulls = args
|
60
|
+
end
|
61
|
+
|
62
|
+
def seds(*args)
|
63
|
+
@seds = args
|
64
|
+
end
|
65
|
+
|
66
|
+
def delimiter(arg)
|
67
|
+
@delimiter = arg
|
68
|
+
end
|
69
|
+
|
70
|
+
def symbolize(name)
|
71
|
+
name.gsub(/ /,"_").gsub(/\W/, "").downcase.to_sym
|
72
|
+
end
|
73
|
+
|
74
|
+
def psql_copy(psql = [])
|
75
|
+
psql << "COPY #{@instance.table_name} FROM STDIN WITH"
|
76
|
+
psql << "DELIMITER '#{@delimiter}'" unless @delimiter.nil?
|
77
|
+
psql << "CSV HEADER"
|
78
|
+
psql
|
79
|
+
end
|
80
|
+
|
81
|
+
def psql_command(psql = [])
|
82
|
+
psql << "SET DATESTYLE TO #{@datestyle}" unless @datestyle.nil?
|
83
|
+
psql << psql_copy.join(" ")
|
84
|
+
psql
|
85
|
+
end
|
86
|
+
|
87
|
+
def sed_command(sed = [])
|
88
|
+
sed << nulls_to_sed unless @nulls.nil?
|
89
|
+
sed << @seds unless @seds.nil?
|
90
|
+
sed
|
91
|
+
end
|
92
|
+
|
93
|
+
def nulls_to_sed
|
94
|
+
@nulls.map do |regex|
|
95
|
+
"-e 's/#{regex.source}//g'"
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
# creates a delimiter regular expression
|
100
|
+
def delimiter_regexp
|
101
|
+
Regexp.new(@delimiter.nil? ? "," : "\\#{@delimiter}")
|
102
|
+
end
|
103
|
+
|
104
|
+
# read the first line from the stream to create a table with
|
105
|
+
def create_table
|
106
|
+
f = File.open(@stream, 'r')
|
107
|
+
options = {:id => false}
|
108
|
+
options.merge!(:temporary => true) if @options[:temporary].nil?
|
109
|
+
instance.connection.create_table(instance.table_name, options) do |t|
|
110
|
+
f.each_line do |line|
|
111
|
+
line.split(delimiter_regexp).each do |col|
|
112
|
+
column_name = symbolize(col)
|
113
|
+
if custom = @column_names.fetch(column_name, nil)
|
114
|
+
t.column(*custom)
|
115
|
+
else
|
116
|
+
t.string column_name
|
117
|
+
end
|
118
|
+
end
|
119
|
+
break
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
# system command for IO subprocesses, commands are piped to
|
125
|
+
# take advantage of multi cores
|
126
|
+
def system_command
|
127
|
+
unless sed_command.empty?
|
128
|
+
"cat #{@stream} | sed #{sed_command.join(" | sed ")}"
|
129
|
+
else
|
130
|
+
"cat #{@stream}"
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
# addition of a primary key after the data has been piped to
|
135
|
+
# the table
|
136
|
+
def add_primary_key
|
137
|
+
instance.connection.raw_connection.query "ALTER TABLE #{instance.table_name} ADD COLUMN agents_pkey serial PRIMARY KEY;"
|
138
|
+
end
|
139
|
+
|
140
|
+
# use the PostgreSQL COPY command using STDIN with CSV HEADER
|
141
|
+
# reads chunks of 8192 bytes to save memory
|
142
|
+
def pipe_it(l = "")
|
143
|
+
raise "table does not exist" unless instance.table_exists?
|
144
|
+
raw = instance.connection.raw_connection
|
145
|
+
raw.query psql_command.join("; ")
|
146
|
+
f = IO.popen(system_command)
|
147
|
+
begin
|
148
|
+
while f.read(8192, l)
|
149
|
+
raw.put_copy_data l
|
150
|
+
end
|
151
|
+
rescue EOFError
|
152
|
+
f.close
|
153
|
+
end
|
154
|
+
raw.put_copy_end
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
data/lib/theman/version.rb
CHANGED
data/lib/theman.rb
CHANGED
data/spec/theman_spec.rb
CHANGED
@@ -14,7 +14,7 @@ describe Theman::Agency, "instance object" do
|
|
14
14
|
end
|
15
15
|
|
16
16
|
it "should have a table name" do
|
17
|
-
@instance.table_name.should match /
|
17
|
+
@instance.table_name.should match /agent[0-9]{10}/
|
18
18
|
end
|
19
19
|
|
20
20
|
it "should have an ispect method" do
|
@@ -81,7 +81,7 @@ describe Theman::Agency, "data types" do
|
|
81
81
|
end
|
82
82
|
|
83
83
|
it "should have an array of nulls" do
|
84
|
-
@agent.
|
84
|
+
@agent.nulls_to_sed.should == ["-e 's/\"N\"//g'", "-e 's/\"UNKNOWN\"//g'", "-e 's/\"\"//g'"]
|
85
85
|
end
|
86
86
|
|
87
87
|
it "should have nulls not strings" do
|
@@ -167,3 +167,46 @@ describe Theman::Agency, "procedural" do
|
|
167
167
|
my_model.count.should == 5
|
168
168
|
end
|
169
169
|
end
|
170
|
+
|
171
|
+
describe Theman::Agency, "create table" do
|
172
|
+
before do
|
173
|
+
@csv = File.expand_path(File.join(File.dirname(__FILE__), '..', 'spec', 'fixtures', 'temp_one.csv'))
|
174
|
+
@agent = ::Theman::Agency.new @csv do |agent|
|
175
|
+
agent.nulls /"N"/, /"UNKNOWN"/, /""/
|
176
|
+
agent.table do |t|
|
177
|
+
t.string :col_two, :limit => 50
|
178
|
+
end
|
179
|
+
end
|
180
|
+
@instance = @agent.instance
|
181
|
+
end
|
182
|
+
|
183
|
+
it "should have" do
|
184
|
+
@instance.first.col_two.should == "some \\text\\"
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
188
|
+
describe Theman::Agency, "add primary key" do
|
189
|
+
before do
|
190
|
+
@csv = File.expand_path(File.join(File.dirname(__FILE__), '..', 'spec', 'fixtures', 'temp_one.csv'))
|
191
|
+
@agent = ::Theman::Agency.new @csv, ActiveRecord::Base, :primary_key => true
|
192
|
+
@instance = @agent.instance
|
193
|
+
end
|
194
|
+
|
195
|
+
it "should have serial primary key" do
|
196
|
+
@instance.first.agents_pkey.should == 1
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
200
|
+
describe Theman::Agency, "delimiters" do
|
201
|
+
before do
|
202
|
+
@csv = File.expand_path(File.join(File.dirname(__FILE__), '..', 'spec', 'fixtures', 'temp_six.txt'))
|
203
|
+
@agent = ::Theman::Agency.new @csv do |agent|
|
204
|
+
agent.delimiter "|"
|
205
|
+
end
|
206
|
+
@instance = @agent.instance
|
207
|
+
end
|
208
|
+
|
209
|
+
it "should have imported pipe delimited txt file" do
|
210
|
+
@instance.count.should == 4
|
211
|
+
end
|
212
|
+
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: 0.0.
|
8
|
+
- 5
|
9
|
+
version: 0.0.5
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Rufus Post
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-11-08 00:00:00 +11:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -116,11 +116,12 @@ files:
|
|
116
116
|
- README.rdoc
|
117
117
|
- Rakefile
|
118
118
|
- lib/theman.rb
|
119
|
-
- lib/theman/
|
119
|
+
- lib/theman/agency.rb
|
120
120
|
- lib/theman/version.rb
|
121
121
|
- spec/fixtures/temp_five.csv
|
122
122
|
- spec/fixtures/temp_four.csv
|
123
123
|
- spec/fixtures/temp_one.csv
|
124
|
+
- spec/fixtures/temp_six.txt
|
124
125
|
- spec/fixtures/temp_three.csv
|
125
126
|
- spec/fixtures/temp_two.csv
|
126
127
|
- spec/spec_helper.rb
|
@@ -1,133 +0,0 @@
|
|
1
|
-
module Theman
|
2
|
-
class Agency
|
3
|
-
attr_reader :instance, :column_names, :null_replacements, :sed_commands
|
4
|
-
|
5
|
-
def initialize(stream = nil, parent = ::ActiveRecord::Base)
|
6
|
-
# source of the data
|
7
|
-
@stream = stream
|
8
|
-
|
9
|
-
# create a new class that extends an active record model
|
10
|
-
# use instance_parent(klass) if not ActiveRecord::Base
|
11
|
-
cabinet_id = "c#{10.times.map{rand(9)}.join}"
|
12
|
-
@column_names = {}
|
13
|
-
@instance = Class.new(parent) do
|
14
|
-
instance_eval <<-EOV, __FILE__, __LINE__ + 1
|
15
|
-
set_table_name "#{cabinet_id}"
|
16
|
-
def table_name
|
17
|
-
"#{cabinet_id}"
|
18
|
-
end
|
19
|
-
def inspect
|
20
|
-
"Agent (#{cabinet_id})"
|
21
|
-
end
|
22
|
-
EOV
|
23
|
-
end
|
24
|
-
|
25
|
-
# if stream given table will be created
|
26
|
-
# other wise create_table and pipe_it will need to called
|
27
|
-
# proceduraly
|
28
|
-
if stream
|
29
|
-
if block_given?
|
30
|
-
yield self
|
31
|
-
end
|
32
|
-
create_table
|
33
|
-
pipe_it
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
def table
|
38
|
-
yield self if block_given?
|
39
|
-
end
|
40
|
-
|
41
|
-
# overide ActiveRecord column types to be used in a block
|
42
|
-
%w( string text integer float decimal datetime timestamp time date binary boolean ).each do |column_type|
|
43
|
-
class_eval <<-EOV, __FILE__, __LINE__ + 1
|
44
|
-
def #{column_type}(*args)
|
45
|
-
column(args[0], '#{column_type}', args[1].nil? ? {} : args[1])
|
46
|
-
end
|
47
|
-
EOV
|
48
|
-
end
|
49
|
-
|
50
|
-
# overides the default string type column
|
51
|
-
def column(name, type, options)
|
52
|
-
@column_names.merge! name.to_sym => [name, type, options]
|
53
|
-
end
|
54
|
-
|
55
|
-
def create_table
|
56
|
-
f = File.open(@stream, 'r')
|
57
|
-
instance.connection.create_table(instance.table_name, :temporary => true, :id => false) do |t|
|
58
|
-
f.each_line do |line|
|
59
|
-
line.split(/,/).each do |col|
|
60
|
-
column_name = symbolize(col)
|
61
|
-
if custom = @column_names.fetch(column_name, nil)
|
62
|
-
t.column(*custom)
|
63
|
-
else
|
64
|
-
t.string column_name
|
65
|
-
end
|
66
|
-
end
|
67
|
-
break
|
68
|
-
end
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
def stream(path)
|
73
|
-
@stream = path
|
74
|
-
end
|
75
|
-
|
76
|
-
def datestyle(local)
|
77
|
-
@psql_datestyle = local
|
78
|
-
end
|
79
|
-
|
80
|
-
def psql_command
|
81
|
-
psql = []
|
82
|
-
psql << "SET DATESTYLE TO #{@psql_datestyle}" unless @psql_datestyle.nil?
|
83
|
-
psql << "COPY #{instance.table_name} FROM STDIN WITH CSV HEADER"
|
84
|
-
psql.join("; ")
|
85
|
-
end
|
86
|
-
|
87
|
-
# use postgress COPY command using STDIN with CSV HEADER
|
88
|
-
# reads chunks of 8192 bytes to save memory
|
89
|
-
def pipe_it(l = "")
|
90
|
-
raw = instance.connection.raw_connection
|
91
|
-
raw.query psql_command
|
92
|
-
command = "cat #{@stream} #{seds_join}"
|
93
|
-
f = IO.popen(command)
|
94
|
-
begin
|
95
|
-
while f.read(8192, l)
|
96
|
-
raw.put_copy_data l
|
97
|
-
end
|
98
|
-
rescue EOFError
|
99
|
-
f.close
|
100
|
-
end
|
101
|
-
raw.put_copy_end
|
102
|
-
end
|
103
|
-
|
104
|
-
def nulls(*args)
|
105
|
-
@null_replacements = args
|
106
|
-
end
|
107
|
-
|
108
|
-
def seds(*args)
|
109
|
-
@sed_commands = args
|
110
|
-
end
|
111
|
-
|
112
|
-
def symbolize(name)
|
113
|
-
name.gsub(/ /,"_").gsub(/\W/, "").downcase.to_sym
|
114
|
-
end
|
115
|
-
|
116
|
-
# join together the sed commands to apply to stream
|
117
|
-
def seds_join(commands = [])
|
118
|
-
unless null_replacements.nil?
|
119
|
-
commands << "| sed #{nulls_to_sed.join(" ")}"
|
120
|
-
end
|
121
|
-
unless sed_commands.nil?
|
122
|
-
commands << "| sed #{sed_commands.join("| sed ")}"
|
123
|
-
end
|
124
|
-
commands.join(" ")
|
125
|
-
end
|
126
|
-
|
127
|
-
def nulls_to_sed
|
128
|
-
@null_replacements.map do |null|
|
129
|
-
"-e 's/#{null.source}//g'"
|
130
|
-
end
|
131
|
-
end
|
132
|
-
end
|
133
|
-
end
|