theman 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +60 -54
- data/lib/theman/agency.rb +157 -0
- data/lib/theman/version.rb +1 -1
- data/lib/theman.rb +1 -1
- data/spec/fixtures/temp_six.txt +5 -0
- data/spec/theman_spec.rb +45 -2
- metadata +5 -4
- data/lib/theman/themans_agency.rb +0 -133
data/README.rdoc
CHANGED
@@ -2,109 +2,115 @@
|
|
2
2
|
|
3
3
|
The man getting you down?
|
4
4
|
|
5
|
-
FasterCSV is great
|
6
|
-
|
7
|
-
|
5
|
+
FasterCSV is great but when you get to 100MB files it takes too long and
|
6
|
+
you may only be looking for certain records that match some criteria,
|
7
|
+
enter Theman.
|
8
8
|
|
9
9
|
== Installation
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
gem 'theman'
|
11
|
+
gem install 'theman'
|
14
12
|
|
15
|
-
|
13
|
+
== Basic Usage
|
16
14
|
|
17
|
-
|
15
|
+
my_agent = ::Theman::Agency.new 'pretty.csv'
|
16
|
+
my_agent.instance.count
|
18
17
|
|
19
|
-
|
18
|
+
Say if you wanted to analyse some data the man has given you, you could make the
|
19
|
+
table persist by doing the following in an IRB session:
|
20
20
|
|
21
|
-
|
21
|
+
require 'active_record'
|
22
|
+
require 'theman'
|
22
23
|
|
23
|
-
|
24
|
+
ActiveRecord::Base.establish_connection :database => $USER, :host => "localhost", :adapter => "postgresql"
|
24
25
|
|
25
|
-
|
26
|
+
my_agent = ::Theman::Agency.new('pretty.csv', ActiveRecord::Base, :temporary => false) {|a| a.date :date_col }
|
26
27
|
|
27
|
-
|
28
|
-
|
29
|
-
temp_model.count
|
28
|
+
After you have figured out what the hell is going on with the data, you can get the man off
|
29
|
+
your back (for now).
|
30
30
|
|
31
31
|
== Advanced Usage
|
32
32
|
|
33
|
-
my_agent = ::Theman::Agency.new 'ugly.csv' do |
|
34
|
-
|
35
|
-
|
36
|
-
|
33
|
+
my_agent = ::Theman::Agency.new 'ugly.csv', ActiveRecord::Base, :primary_key => true do |s|
|
34
|
+
s.nulls /"N"/, /"UNKNOWN"/, /""/
|
35
|
+
s.seds "-n -e :a -e '1,15!{P;N;D;};N;ba'"
|
36
|
+
s.delimiter "|"
|
37
|
+
s.table do |t|
|
38
|
+
t.string :name, :limit => 50
|
37
39
|
t.date :date
|
38
40
|
t.integer :ext_id
|
39
41
|
t.float :amount
|
40
42
|
t.boolean :exited
|
41
43
|
end
|
42
44
|
end
|
43
|
-
temp_model = my_agent.instance
|
44
|
-
temp_model.where(:exited => true).count
|
45
45
|
|
46
|
-
|
46
|
+
my_agent.instance.where(:exited => true).count
|
47
|
+
|
48
|
+
In the above example we omitted the last 15 rows, made some things null,
|
49
|
+
added a primary key and changed some column data types to something else.
|
47
50
|
|
48
|
-
If you do not provide a table block your columns will be VARCHAR(255)
|
49
|
-
can cherry pick
|
51
|
+
If you do not provide a table block your columns will be VARCHAR(255); you
|
52
|
+
can cherry pick the columns that you want to change the data types for.
|
50
53
|
|
51
|
-
The temp table has no id column but you
|
54
|
+
The temp table has no id column by default, but you can add one using the options
|
55
|
+
hash or by calling add_primary_key; this will add the agents_pkey column.
|
52
56
|
|
53
|
-
If you want to call this procedural just don't pass in the path to the file
|
54
|
-
and Theman will not create a table in
|
55
|
-
|
57
|
+
If you want to call this procedural style just don't pass in the path to the file
|
58
|
+
and Theman will not create a table; in this case you will need to call everything
|
59
|
+
explicitly:
|
56
60
|
|
57
61
|
smith = ::Theman::Agency.new
|
58
62
|
smith.stream 'real_ugly.csv'
|
59
|
-
smith.seds "-n -e :a -e '1,15!{P;N;D;};N;ba'"
|
60
|
-
smith.nulls /"XXXX"/
|
61
|
-
smith.date :date
|
62
|
-
|
63
63
|
smith.create_table
|
64
64
|
smith.pipe_it
|
65
65
|
|
66
|
+
WARNING: if you have user input in your sed commands, don't.
|
67
|
+
|
68
|
+
NOTE: When passing in extra sed commands, each one will be given its own subprocess;
|
69
|
+
this makes it much faster if you have more than one core.
|
70
|
+
|
66
71
|
== Dates
|
67
72
|
|
68
|
-
Ah dates,
|
73
|
+
Ah dates, the joy! Use datestyle to tell Theman to then tell PostgreSQL:
|
69
74
|
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
+
my_agent = ::Theman::Agency.new 'uber_foie_gras.csv' do |schmit|
|
76
|
+
schmit.datestyle 'European'
|
77
|
+
schmit.table do |t|
|
78
|
+
t.date :start_date
|
79
|
+
t.date :end_date
|
75
80
|
end
|
81
|
+
end
|
76
82
|
|
77
|
-
Refer to
|
83
|
+
Refer to the PostgreSQL docs for more info; in the meantime, here is some
|
84
|
+
copy and paste action:
|
78
85
|
|
79
|
-
ISO
|
86
|
+
=== ISO
|
80
87
|
|
81
|
-
|
88
|
+
Use ISO 8601-style dates and times (YYYY-MM-DD HH:MM:SS). This is the default.
|
82
89
|
|
83
|
-
SQL
|
90
|
+
=== SQL
|
84
91
|
|
85
|
-
|
86
|
-
(which mandates ISO 8601 style), the naming of this option is a historical accident.
|
92
|
+
Use Oracle/Ingres-style dates and times.
|
87
93
|
|
88
|
-
PostgreSQL
|
94
|
+
=== PostgreSQL
|
89
95
|
|
90
|
-
|
96
|
+
Use traditional PostgreSQL format.
|
91
97
|
|
92
|
-
German
|
98
|
+
=== German
|
93
99
|
|
94
|
-
|
100
|
+
dd.mm.yyyy
|
95
101
|
|
96
|
-
European
|
102
|
+
=== European
|
97
103
|
|
98
|
-
|
104
|
+
dd/mm/yyyy
|
99
105
|
|
100
|
-
US
|
106
|
+
=== US
|
101
107
|
|
102
|
-
|
108
|
+
mm/dd/yyyy
|
103
109
|
|
104
110
|
== Troubles
|
105
111
|
|
106
|
-
Table empty? the man (the real life one) has given you crappy data and
|
107
|
-
has silently dissed it.
|
112
|
+
Table empty? The man (the real-life one) has given you crappy data and
|
113
|
+
PostgreSQL has silently dissed it.
|
108
114
|
|
109
115
|
== Copyright
|
110
116
|
|
@@ -0,0 +1,157 @@
|
|
1
|
+
module Theman
|
2
|
+
class Agency
|
3
|
+
attr_reader :instance, :column_names
|
4
|
+
|
5
|
+
def initialize(stream = nil, parent = ::ActiveRecord::Base, options = {})
|
6
|
+
@options = options
|
7
|
+
@stream = stream
|
8
|
+
|
9
|
+
agent_id = sprintf "agent%010d", rand(100000000)
|
10
|
+
@column_names = {}
|
11
|
+
@instance = Class.new(parent) do
|
12
|
+
instance_eval <<-EOV, __FILE__, __LINE__ + 1
|
13
|
+
set_table_name "#{agent_id}"
|
14
|
+
def table_name
|
15
|
+
"#{agent_id}"
|
16
|
+
end
|
17
|
+
def inspect
|
18
|
+
"Agent (#{agent_id})"
|
19
|
+
end
|
20
|
+
EOV
|
21
|
+
end
|
22
|
+
|
23
|
+
yield self if block_given?
|
24
|
+
return unless stream
|
25
|
+
create_table
|
26
|
+
pipe_it
|
27
|
+
if @options[:primary_key]
|
28
|
+
add_primary_key
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def table
|
33
|
+
yield self if block_given?
|
34
|
+
end
|
35
|
+
|
36
|
+
# column data type methods
|
37
|
+
%w( string text integer float decimal datetime timestamp time date binary boolean ).each do |column_type|
|
38
|
+
class_eval <<-EOV, __FILE__, __LINE__ + 1
|
39
|
+
def #{column_type}(column_name, *args)
|
40
|
+
column(column_name, '#{column_type}', *args)
|
41
|
+
end
|
42
|
+
EOV
|
43
|
+
end
|
44
|
+
|
45
|
+
# overrides the default string type column
|
46
|
+
def column(column_name, column_type, *args)
|
47
|
+
@column_names.merge! column_name.to_sym => [column_name, column_type, *args]
|
48
|
+
end
|
49
|
+
|
50
|
+
def stream(arg)
|
51
|
+
@stream = arg
|
52
|
+
end
|
53
|
+
|
54
|
+
def datestyle(arg)
|
55
|
+
@datestyle = arg
|
56
|
+
end
|
57
|
+
|
58
|
+
def nulls(*args)
|
59
|
+
@nulls = args
|
60
|
+
end
|
61
|
+
|
62
|
+
def seds(*args)
|
63
|
+
@seds = args
|
64
|
+
end
|
65
|
+
|
66
|
+
def delimiter(arg)
|
67
|
+
@delimiter = arg
|
68
|
+
end
|
69
|
+
|
70
|
+
def symbolize(name)
|
71
|
+
name.gsub(/ /,"_").gsub(/\W/, "").downcase.to_sym
|
72
|
+
end
|
73
|
+
|
74
|
+
def psql_copy(psql = [])
|
75
|
+
psql << "COPY #{@instance.table_name} FROM STDIN WITH"
|
76
|
+
psql << "DELIMITER '#{@delimiter}'" unless @delimiter.nil?
|
77
|
+
psql << "CSV HEADER"
|
78
|
+
psql
|
79
|
+
end
|
80
|
+
|
81
|
+
def psql_command(psql = [])
|
82
|
+
psql << "SET DATESTYLE TO #{@datestyle}" unless @datestyle.nil?
|
83
|
+
psql << psql_copy.join(" ")
|
84
|
+
psql
|
85
|
+
end
|
86
|
+
|
87
|
+
def sed_command(sed = [])
|
88
|
+
sed << nulls_to_sed unless @nulls.nil?
|
89
|
+
sed << @seds unless @seds.nil?
|
90
|
+
sed
|
91
|
+
end
|
92
|
+
|
93
|
+
def nulls_to_sed
|
94
|
+
@nulls.map do |regex|
|
95
|
+
"-e 's/#{regex.source}//g'"
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
# creates a delimiter regular expression
|
100
|
+
def delimiter_regexp
|
101
|
+
Regexp.new(@delimiter.nil? ? "," : "\\#{@delimiter}")
|
102
|
+
end
|
103
|
+
|
104
|
+
# read the first line from the stream to create a table with
|
105
|
+
def create_table
|
106
|
+
f = File.open(@stream, 'r')
|
107
|
+
options = {:id => false}
|
108
|
+
options.merge!(:temporary => true) if @options[:temporary].nil?
|
109
|
+
instance.connection.create_table(instance.table_name, options) do |t|
|
110
|
+
f.each_line do |line|
|
111
|
+
line.split(delimiter_regexp).each do |col|
|
112
|
+
column_name = symbolize(col)
|
113
|
+
if custom = @column_names.fetch(column_name, nil)
|
114
|
+
t.column(*custom)
|
115
|
+
else
|
116
|
+
t.string column_name
|
117
|
+
end
|
118
|
+
end
|
119
|
+
break
|
120
|
+
end
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
# system command for IO subprocesses, commands are piped to
|
125
|
+
# take advantage of multi cores
|
126
|
+
def system_command
|
127
|
+
unless sed_command.empty?
|
128
|
+
"cat #{@stream} | sed #{sed_command.join(" | sed ")}"
|
129
|
+
else
|
130
|
+
"cat #{@stream}"
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
# addition of a primary key after the data has been piped to
|
135
|
+
# the table
|
136
|
+
def add_primary_key
|
137
|
+
instance.connection.raw_connection.query "ALTER TABLE #{instance.table_name} ADD COLUMN agents_pkey serial PRIMARY KEY;"
|
138
|
+
end
|
139
|
+
|
140
|
+
# use the PostgreSQL COPY command using STDIN with CSV HEADER
|
141
|
+
# reads chunks of 8192 bytes to save memory
|
142
|
+
def pipe_it(l = "")
|
143
|
+
raise "table does not exist" unless instance.table_exists?
|
144
|
+
raw = instance.connection.raw_connection
|
145
|
+
raw.query psql_command.join("; ")
|
146
|
+
f = IO.popen(system_command)
|
147
|
+
begin
|
148
|
+
while f.read(8192, l)
|
149
|
+
raw.put_copy_data l
|
150
|
+
end
|
151
|
+
rescue EOFError
|
152
|
+
f.close
|
153
|
+
end
|
154
|
+
raw.put_copy_end
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
data/lib/theman/version.rb
CHANGED
data/lib/theman.rb
CHANGED
data/spec/theman_spec.rb
CHANGED
@@ -14,7 +14,7 @@ describe Theman::Agency, "instance object" do
|
|
14
14
|
end
|
15
15
|
|
16
16
|
it "should have a table name" do
|
17
|
-
@instance.table_name.should match /
|
17
|
+
@instance.table_name.should match /agent[0-9]{10}/
|
18
18
|
end
|
19
19
|
|
20
20
|
it "should have an ispect method" do
|
@@ -81,7 +81,7 @@ describe Theman::Agency, "data types" do
|
|
81
81
|
end
|
82
82
|
|
83
83
|
it "should have an array of nulls" do
|
84
|
-
@agent.
|
84
|
+
@agent.nulls_to_sed.should == ["-e 's/\"N\"//g'", "-e 's/\"UNKNOWN\"//g'", "-e 's/\"\"//g'"]
|
85
85
|
end
|
86
86
|
|
87
87
|
it "should have nulls not strings" do
|
@@ -167,3 +167,46 @@ describe Theman::Agency, "procedural" do
|
|
167
167
|
my_model.count.should == 5
|
168
168
|
end
|
169
169
|
end
|
170
|
+
|
171
|
+
describe Theman::Agency, "create table" do
|
172
|
+
before do
|
173
|
+
@csv = File.expand_path(File.join(File.dirname(__FILE__), '..', 'spec', 'fixtures', 'temp_one.csv'))
|
174
|
+
@agent = ::Theman::Agency.new @csv do |agent|
|
175
|
+
agent.nulls /"N"/, /"UNKNOWN"/, /""/
|
176
|
+
agent.table do |t|
|
177
|
+
t.string :col_two, :limit => 50
|
178
|
+
end
|
179
|
+
end
|
180
|
+
@instance = @agent.instance
|
181
|
+
end
|
182
|
+
|
183
|
+
it "should have" do
|
184
|
+
@instance.first.col_two.should == "some \\text\\"
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
188
|
+
describe Theman::Agency, "add primary key" do
|
189
|
+
before do
|
190
|
+
@csv = File.expand_path(File.join(File.dirname(__FILE__), '..', 'spec', 'fixtures', 'temp_one.csv'))
|
191
|
+
@agent = ::Theman::Agency.new @csv, ActiveRecord::Base, :primary_key => true
|
192
|
+
@instance = @agent.instance
|
193
|
+
end
|
194
|
+
|
195
|
+
it "should have serial primary key" do
|
196
|
+
@instance.first.agents_pkey.should == 1
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
200
|
+
describe Theman::Agency, "delimiters" do
|
201
|
+
before do
|
202
|
+
@csv = File.expand_path(File.join(File.dirname(__FILE__), '..', 'spec', 'fixtures', 'temp_six.txt'))
|
203
|
+
@agent = ::Theman::Agency.new @csv do |agent|
|
204
|
+
agent.delimiter "|"
|
205
|
+
end
|
206
|
+
@instance = @agent.instance
|
207
|
+
end
|
208
|
+
|
209
|
+
it "should have imported pipe delimited txt file" do
|
210
|
+
@instance.count.should == 4
|
211
|
+
end
|
212
|
+
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
-
|
9
|
-
version: 0.0.
|
8
|
+
- 5
|
9
|
+
version: 0.0.5
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Rufus Post
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-11-08 00:00:00 +11:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -116,11 +116,12 @@ files:
|
|
116
116
|
- README.rdoc
|
117
117
|
- Rakefile
|
118
118
|
- lib/theman.rb
|
119
|
-
- lib/theman/
|
119
|
+
- lib/theman/agency.rb
|
120
120
|
- lib/theman/version.rb
|
121
121
|
- spec/fixtures/temp_five.csv
|
122
122
|
- spec/fixtures/temp_four.csv
|
123
123
|
- spec/fixtures/temp_one.csv
|
124
|
+
- spec/fixtures/temp_six.txt
|
124
125
|
- spec/fixtures/temp_three.csv
|
125
126
|
- spec/fixtures/temp_two.csv
|
126
127
|
- spec/spec_helper.rb
|
@@ -1,133 +0,0 @@
|
|
1
|
-
module Theman
|
2
|
-
class Agency
|
3
|
-
attr_reader :instance, :column_names, :null_replacements, :sed_commands
|
4
|
-
|
5
|
-
def initialize(stream = nil, parent = ::ActiveRecord::Base)
|
6
|
-
# source of the data
|
7
|
-
@stream = stream
|
8
|
-
|
9
|
-
# create a new class that extends an active record model
|
10
|
-
# use instance_parent(klass) if not ActiveRecord::Base
|
11
|
-
cabinet_id = "c#{10.times.map{rand(9)}.join}"
|
12
|
-
@column_names = {}
|
13
|
-
@instance = Class.new(parent) do
|
14
|
-
instance_eval <<-EOV, __FILE__, __LINE__ + 1
|
15
|
-
set_table_name "#{cabinet_id}"
|
16
|
-
def table_name
|
17
|
-
"#{cabinet_id}"
|
18
|
-
end
|
19
|
-
def inspect
|
20
|
-
"Agent (#{cabinet_id})"
|
21
|
-
end
|
22
|
-
EOV
|
23
|
-
end
|
24
|
-
|
25
|
-
# if stream given table will be created
|
26
|
-
# other wise create_table and pipe_it will need to called
|
27
|
-
# proceduraly
|
28
|
-
if stream
|
29
|
-
if block_given?
|
30
|
-
yield self
|
31
|
-
end
|
32
|
-
create_table
|
33
|
-
pipe_it
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
def table
|
38
|
-
yield self if block_given?
|
39
|
-
end
|
40
|
-
|
41
|
-
# overide ActiveRecord column types to be used in a block
|
42
|
-
%w( string text integer float decimal datetime timestamp time date binary boolean ).each do |column_type|
|
43
|
-
class_eval <<-EOV, __FILE__, __LINE__ + 1
|
44
|
-
def #{column_type}(*args)
|
45
|
-
column(args[0], '#{column_type}', args[1].nil? ? {} : args[1])
|
46
|
-
end
|
47
|
-
EOV
|
48
|
-
end
|
49
|
-
|
50
|
-
# overides the default string type column
|
51
|
-
def column(name, type, options)
|
52
|
-
@column_names.merge! name.to_sym => [name, type, options]
|
53
|
-
end
|
54
|
-
|
55
|
-
def create_table
|
56
|
-
f = File.open(@stream, 'r')
|
57
|
-
instance.connection.create_table(instance.table_name, :temporary => true, :id => false) do |t|
|
58
|
-
f.each_line do |line|
|
59
|
-
line.split(/,/).each do |col|
|
60
|
-
column_name = symbolize(col)
|
61
|
-
if custom = @column_names.fetch(column_name, nil)
|
62
|
-
t.column(*custom)
|
63
|
-
else
|
64
|
-
t.string column_name
|
65
|
-
end
|
66
|
-
end
|
67
|
-
break
|
68
|
-
end
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
def stream(path)
|
73
|
-
@stream = path
|
74
|
-
end
|
75
|
-
|
76
|
-
def datestyle(local)
|
77
|
-
@psql_datestyle = local
|
78
|
-
end
|
79
|
-
|
80
|
-
def psql_command
|
81
|
-
psql = []
|
82
|
-
psql << "SET DATESTYLE TO #{@psql_datestyle}" unless @psql_datestyle.nil?
|
83
|
-
psql << "COPY #{instance.table_name} FROM STDIN WITH CSV HEADER"
|
84
|
-
psql.join("; ")
|
85
|
-
end
|
86
|
-
|
87
|
-
# use postgress COPY command using STDIN with CSV HEADER
|
88
|
-
# reads chunks of 8192 bytes to save memory
|
89
|
-
def pipe_it(l = "")
|
90
|
-
raw = instance.connection.raw_connection
|
91
|
-
raw.query psql_command
|
92
|
-
command = "cat #{@stream} #{seds_join}"
|
93
|
-
f = IO.popen(command)
|
94
|
-
begin
|
95
|
-
while f.read(8192, l)
|
96
|
-
raw.put_copy_data l
|
97
|
-
end
|
98
|
-
rescue EOFError
|
99
|
-
f.close
|
100
|
-
end
|
101
|
-
raw.put_copy_end
|
102
|
-
end
|
103
|
-
|
104
|
-
def nulls(*args)
|
105
|
-
@null_replacements = args
|
106
|
-
end
|
107
|
-
|
108
|
-
def seds(*args)
|
109
|
-
@sed_commands = args
|
110
|
-
end
|
111
|
-
|
112
|
-
def symbolize(name)
|
113
|
-
name.gsub(/ /,"_").gsub(/\W/, "").downcase.to_sym
|
114
|
-
end
|
115
|
-
|
116
|
-
# join together the sed commands to apply to stream
|
117
|
-
def seds_join(commands = [])
|
118
|
-
unless null_replacements.nil?
|
119
|
-
commands << "| sed #{nulls_to_sed.join(" ")}"
|
120
|
-
end
|
121
|
-
unless sed_commands.nil?
|
122
|
-
commands << "| sed #{sed_commands.join("| sed ")}"
|
123
|
-
end
|
124
|
-
commands.join(" ")
|
125
|
-
end
|
126
|
-
|
127
|
-
def nulls_to_sed
|
128
|
-
@null_replacements.map do |null|
|
129
|
-
"-e 's/#{null.source}//g'"
|
130
|
-
end
|
131
|
-
end
|
132
|
-
end
|
133
|
-
end
|