tables 0.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -0
- data/README +3 -0
- data/Rakefile +35 -0
- data/lib/tables.rb +14 -0
- data/lib/tables/excel_table_reader.rb +175 -0
- data/lib/tables/table.rb +260 -0
- data/lib/tables/table_reader.rb +65 -0
- data/lib/tables/word_table_reader.rb +83 -0
- data/lib/tables/word_table_writer.rb +64 -0
- data/spec/copy_spec.rb +33 -0
- data/spec/excel_table_reader_spec.rb +132 -0
- data/spec/office_copy_spec.rb +55 -0
- data/spec/table_spec.rb +237 -0
- data/spec/text_handling_spec.rb +37 -0
- data/spec/word_table_reader_spec.rb +70 -0
- data/spec/word_table_writer_spec.rb +37 -0
- metadata +64 -0
data/LICENSE
ADDED
data/README
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
#
|
2
|
+
#
|
3
|
+
# Author:: Saul Caganoff (mailto:scaganoff@gmail.com)
|
4
|
+
# Copyright:: Copyright (c) 2010, Saul Caganoff
|
5
|
+
# License:: Creative Commons Attribution 3.0 Australia License (http://creativecommons.org/licenses/by/3.0/au/)
|
6
|
+
#
|
7
|
+
|
8
|
+
|
9
|
+
require 'rubygems'
|
10
|
+
require 'rake'
|
11
|
+
require 'rake/clean'
|
12
|
+
require 'rubygems/package_task'
|
13
|
+
require 'rdoc/task'
|
14
|
+
require 'rake/testtask'
|
15
|
+
|
16
|
+
|
17
|
+
|
18
|
+
# Packaging task: builds the gem plus a tar archive, but no zip archive.
# NOTE(review): `spec` is not defined anywhere in the visible part of this
# Rakefile; presumably it should hold the Gem::Specification for the gem.
# Confirm it is defined before this point, otherwise loading the Rakefile
# raises NameError.
Gem::PackageTask.new(spec) do |p|
  p.gem_spec = spec
  p.need_tar = true
  p.need_zip = false
end

# Documentation task: generates RDoc for README, LICENSE and everything
# under lib/, writing the output to doc/rdoc.
RDoc::Task.new do |rdoc|
  files =['README', 'LICENSE', 'lib/**/*.rb']
  rdoc.rdoc_files.add(files)
  rdoc.main = "README" # page to start on
  rdoc.title = "Tables Docs"
  rdoc.rdoc_dir = 'doc/rdoc' # rdoc output folder
  rdoc.options << '--line-numbers'
end

# Test task: runs every Ruby file found under test/.
# NOTE(review): the package file listing shows a spec/ directory and no
# test/ directory -- confirm this pattern matches the files meant to run.
Rake::TestTask.new do |t|
  t.test_files = FileList['test/**/*.rb']
end
|
data/lib/tables.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#
|
2
|
+
# Author:: Saul Caganoff (mailto:scaganoff@gmail.com)
|
3
|
+
# Copyright:: Copyright (c) 2010, Saul Caganoff
|
4
|
+
# License:: Creative Commons Attribution 3.0 Australia License (http://creativecommons.org/licenses/by/3.0/au/)
|
5
|
+
#
|
6
|
+
|
7
|
+
$:.unshift File.dirname(__FILE__)
|
8
|
+
|
9
|
+
require 'tables/table'
|
10
|
+
require 'tables/table_reader'
|
11
|
+
require 'tables/excel_table_reader'
|
12
|
+
require 'tables/word_table_reader'
|
13
|
+
require 'tables/word_table_writer'
|
14
|
+
|
@@ -0,0 +1,175 @@
|
|
1
|
+
#
|
2
|
+
# Author:: Saul Caganoff (mailto:scaganoff@gmail.com)
|
3
|
+
# Copyright:: Copyright (c) 2010, Saul Caganoff
|
4
|
+
# License:: Creative Commons Attribution 3.0 Australia License (http://creativecommons.org/licenses/by/3.0/au/)
|
5
|
+
#
|
6
|
+
require 'win32ole'
|
7
|
+
|
8
|
+
module Tables

  # Reads worksheets into Tables::Table objects, and writes tables back to
  # worksheets, by driving Excel through COM automation (WIN32OLE).
  #
  # An 'Excel.Application' automation object is created by the constructor;
  # call #exit to ask it to quit when finished.
  class ExcelTableReader < TableReader

    attr_reader :workbook, :table

    # Starts Excel and, when +filename+ is given, opens that workbook.
    def initialize(filename=nil)
      @excel = WIN32OLE.new('Excel.Application')
      open_file(filename) unless filename.nil?
      super()
    end

    # Opens the workbook at +filename+ and makes it the current workbook.
    #
    # The workbook object returned by Workbooks.Open is kept directly rather
    # than looking up Workbooks.Item(1), which is a different workbook
    # whenever this Excel instance already has another one open (for example
    # after #create_file).
    def open_file(filename)
      path = get_file_path(filename)
      @workbook = @excel.Workbooks.Open(path)
      puts "Open workbook '#{path}'" if $DEBUG
      @worksheets = []
    end

    # Creates a new workbook, makes it current and saves it as +filename+.
    def create_file(filename)
      path = get_file_path(filename)
      @workbook = @excel.Workbooks.Add
      @worksheets = []
      @workbook.SaveAs(path)
    end

    # Extracts the used range of +worksheet+ into a new Table.
    #
    # +worksheet+ is a zero-based sheet number or a sheet name.
    #
    # Options:
    # <tt>:rtf_columns</tt>:: zero-based column numbers to read character by
    #                         character so struck-through text can be dropped
    #                         (see ExcelTableReader.slow_extract_text).
    # <tt>:progress</tt>::    when truthy, print a progress line every 50 rows.
    def extract_table(worksheet, options={})
      rtf_columns = options[:rtf_columns] || []
      progress = options[:progress]
      range = get_worksheet(worksheet).UsedRange
      ncols = range.Columns.Count
      nrows = range.Rows.Count

      result = []
      (1..nrows).each do |idx|
        result << ExcelTableReader.extract_row(range.Rows(idx), ncols, rtf_columns)
        report_progress('extracting', idx, nrows) if progress
      end
      Table.new(result)
    end

    # Writes every row of +table+ (header included) to +worksheet+, starting
    # at sheet row 1. Each value is written as a string.
    #
    # Options:
    # <tt>:progress</tt>:: when truthy, print a progress line every 50 rows.
    def write_table(table, worksheet=0, options={})
      progress = options[:progress]
      sheet = get_worksheet(worksheet)
      nrows = table.row_count
      idx = 0
      table.each_row do |row|
        idx += 1
        report_progress('writing', idx, nrows) if progress
        sheet_row = sheet.Rows(idx)
        row.each_with_index { |val, jdx| sheet_row.Cells(jdx + 1).Value = val.to_s }
      end
    end

    # Writes the single column +column_name+ of +table+ (header included) to
    # the matching column position of +worksheet+. Each value is written as a
    # string.
    #
    # Progress is reported with 1-based row numbers, like #write_table.
    #
    # Options:
    # <tt>:progress</tt>:: when truthy, print a progress line every 50 rows.
    def write_column(table, column_name, worksheet=0, options={})
      progress = options[:progress]
      sheet = get_worksheet(worksheet)
      values = table.get_column(column_name)
      column_index = table.colindex[column_name] + 1   # Excel columns are 1-based
      nrows = table.row_count
      values.each_with_index do |val, idx|
        rownum = idx + 1
        report_progress('updating', rownum, nrows) if progress
        sheet.Rows(rownum).Cells(column_index).Value = val.to_s
      end
    end

    # Number of worksheets in the current workbook.
    def table_count
      @workbook.WorkSheets.Count
    end

    # Removes blank rows and repeated headers from every table in @tables,
    # then demerges them.
    # NOTE(review): @tables is not assigned in this class; presumably it is
    # set up by TableReader -- confirm.
    def clean
      @tables.each do |table|
        table.remove_blank_rows!(1)
        table.remove_repeat_headers!
        table.demerge!
      end
    end

    # Saves the current workbook.
    def save
      @workbook.save
    end

    # Asks the Excel application to quit.
    def exit
      @excel.quit
    end

    # Builds one row (an array of +n+ cell strings) from +excel_row+.
    #
    # +rtf_columns+ holds zero-based column numbers that are read with
    # slow_extract_text; every other column uses extract_text.
    #
    # The extractors are class-level methods, so the +private+ keyword further
    # down does not apply to them; they are kept public on purpose.
    def self.extract_row(excel_row, n, rtf_columns)
      # convert zero-based rtf columns into 1-based for internal loop
      rtf_cols = rtf_columns.map { |idx| idx + 1 }

      (1..n).map do |idx|
        cell = excel_row.Cells(idx)
        rtf_cols.include?(idx) ? slow_extract_text(cell) : extract_text(cell)
      end
    end

    # Returns the displayed text of +range+ with every BEL ("\a") character
    # replaced by " - ", the same substitution slow_extract_text applies to
    # each character.
    def self.extract_text(range)
      range.Text.to_s.gsub("\a", " - ")
    end

    # Returns the text of +range+ read one character at a time, replacing BEL
    # ("\a") with " - " and skipping characters formatted as strikethrough.
    # Falls back to extract_text if the character-level access raises.
    def self.slow_extract_text(range)
      string = ""
      n = range.Characters.Count
      (1..n).each do |idx|
        c = range.Characters(idx, 1)
        t = c.Text
        if t == "\a"
          string << " - "
        elsif !c.Font.Strikethrough
          string << t
        end
      end
      string
    rescue StandardError
      extract_text(range)
    end

    # Earlier variant of extract_text, kept for reference:
    #def ExcelTableReader.extract_text(range)
    #  text = range.Text[0..-3]
    #  list_text = range.ListFormat.ListString
    #  text.empty? ? list_text : text  # return list_text if text is empty
    #end

    private

    # Prints a progress line every 50th row.
    def report_progress(action, idx, total)
      return unless idx.modulo(50) == 0
      percent = (idx.fdiv(total) * 100).round(0)
      puts ">> #{action} row #{idx} (#{percent}%)"
    end

    # Resolves +worksheet+ to a worksheet object. An Integer is a zero-based
    # sheet number, and a new sheet is added when it is past the last existing
    # sheet; any other value is passed to Worksheets.Item as given.
    def get_worksheet(worksheet)
      return @workbook.Worksheets.Item(worksheet) unless worksheet.is_a?(Integer)

      if (worksheet + 1) > @workbook.Worksheets.Count
        @workbook.Worksheets.Add
      else
        @workbook.Worksheets.Item(worksheet + 1)
      end
    end

    # Expands +filename+ to an absolute path via Scripting.FileSystemObject.
    def get_file_path(filename)
      fso = WIN32OLE.new('Scripting.FileSystemObject')
      fso.GetAbsolutePathName(filename)
    end

  end

end
|
data/lib/tables/table.rb
ADDED
@@ -0,0 +1,260 @@
|
|
1
|
+
#
|
2
|
+
# Author:: Saul Caganoff (mailto:scaganoff@gmail.com)
|
3
|
+
# Copyright:: Copyright (c) 2010, Saul Caganoff
|
4
|
+
# License:: Creative Commons Attribution 3.0 Australia License (http://creativecommons.org/licenses/by/3.0/au/)
|
5
|
+
#
|
6
|
+
|
7
|
+
module Tables

  # A simple in-memory table held as an array of row arrays.
  #
  # Row 0 is treated as the header. Two lookup hashes are maintained:
  #
  # +colindex+:: maps each header value to its column number.
  # +rowindex+:: maps each value of the id column to its row number. It is
  #              nil until an id column is chosen with #idcolumn=.
  #
  # Row hashes returned by #get_row carry the extra key <tt>:rownum</tt>,
  # which #set_row uses to find the row to update.
  class Table

    attr_reader :table, :colindex, :idcolumn, :rowindex
    attr_accessor :name

    # Creates a table from an optional array of rows (row 0 is the header)
    # and an optional name. The row arrays are stored as given, not copied.
    def initialize(table=nil, name=nil)
      @table = []
      @colindex = {}
      table.each { |row| @table << row } unless table.nil?
      build_column_index
      self.name = name unless name.nil?
    end

    # The header row, or nil when the table is empty.
    def header
      @table[0]
    end

    #def columns
    #  self.header.count
    #end

    # Chooses the column whose values identify rows and builds +rowindex+.
    # Raises RuntimeError if +column_name+ is not a header value.
    def idcolumn=(column_name)
      raise "Unknown column '#{column_name}'" unless @colindex.has_key?(column_name)
      @idcolumn = column_name
      build_row_index
    end

    # Returns the cell in column +column_name+ of row number +row_num+.
    # Raises RuntimeError for an unknown column or a missing row.
    def get_value(column_name, row_num)
      col_num = @colindex[column_name]
      raise "Unknown column name '#{column_name}'" if col_num.nil?
      row = @table[row_num]
      raise "Unknown row '#{row_num}'" if row.nil?
      row[col_num]
    end

    # Returns a row as a hash of header value => cell, plus <tt>:rownum</tt>.
    #
    # An Integer +arg+ is always taken as a row number; anything else is
    # looked up in +rowindex+. Raises RuntimeError when no id column has been
    # set or the id/row does not exist.
    def get_row(arg)
      return get_row_by_num(arg) if arg.is_a?(Integer)

      raise "No id column has been set" if @rowindex.nil?
      row_num = @rowindex[arg]
      raise "Unknown row id '#{arg}'" if row_num.nil?
      get_row_by_num(row_num)
    end

    # Appends a row and returns self so calls can be chained.
    #
    # An Array must have as many values as the header (the first array added
    # to an empty table becomes the header). A Hash is mapped onto the header
    # by key; on an empty table its keys become the header. Any other kind of
    # argument is ignored. When an id column is set, the new row is added to
    # +rowindex+.
    def add_row(row)
      first_new = row_count
      if row.is_a?(Array)
        add_row_array(row)
      elsif row.is_a?(Hash)
        if first_new > 0
          add_row_hash(row)
        else
          add_first_row_hash(row)
        end
      end
      index_rows_from(first_new) unless @rowindex.nil?
      self
    end

    # The raw row array at position +idx+ (0 is the header).
    def [](idx)
      @table[idx]
    end

    # Two tables are equal when their row data are equal. Objects that do not
    # respond to +table+ are never equal to a table.
    def ==(other)
      other.respond_to?(:table) && table == other.table
    end

    # obsolescent
    def rows
      puts "WARNING: use 'row_count' instead of 'rows'"
      self.row_count
    end

    # Number of columns in the header; 0 for an empty table.
    def column_count
      header.nil? ? 0 : header.count
    end

    # Number of rows, header included.
    def row_count
      @table.count
    end

    # True when both tables have equal header rows.
    def similar?(t2)
      header == t2.header
    end

    # Appends every non-header row of +t2+ to this table and returns self.
    # Raises RuntimeError unless the headers match. A row that cannot be
    # added is reported on standard output and skipped.
    def merge!(t2)
      raise "tables are not similar" unless similar?(t2)
      (1..t2.row_count-1).each do |idx|
        begin
          self << t2[idx]
        rescue StandardError => e
          # StandardError only: interrupts and exit requests must propagate.
          puts "ERROR: Error adding row #{idx} from '#{t2.name}' to '#{self.name}'"
          puts "ERROR: #{e.message}"
        end
      end
      self
    end

    # << is a synonym of add_row
    def <<(row)
      add_row(row)
    end

    # Yields each raw row array, header included.
    def each_row
      @table.each { |row| yield(row) }
    end

    # Yields each raw row array together with its row number.
    def each_row_with_index
      @table.each_with_index { |row, idx| yield(row, idx) }
    end

    # Yields each row as a hash (see #get_row). The header row is skipped
    # unless +skip_header+ is false.
    def get_each_row(skip_header=true)
      @table.each_with_index do |_row, idx|
        next if idx == 0 && skip_header
        yield(get_row(idx))
      end
    end

    # TODO: How can we do this within the get_row closure?
    #
    # Writes the values of a row hash back into row <tt>row[:rownum]</tt>.
    # Keys that are not column names are ignored.
    def set_row(row)
      idx = row[:rownum]
      row.each_pair do |colname, value|
        jdx = @colindex[colname]
        @table[idx][jdx] = value unless jdx.nil?
      end
    end

    # Removes rows whose cells from column +startcol+ onwards are all blank.
    # A row with no cells at or after +startcol+ counts as blank.
    def remove_blank_rows!(startcol=0)
      remove_matched_rows! { |row| (row[startcol..-1] || []).join.strip == "" }
    end

    # Removes rows that repeat the header, keeping the header row itself.
    def remove_repeat_headers!
      first = header
      remove_matched_rows! { |row| row.eql?(first) && !row.equal?(first) }
    end

    # Removes every row for which the block returns true, refreshes the
    # column and row indexes, and returns self.
    def remove_matched_rows!
      # Evaluate the block for every row first, then delete by object
      # identity so equal-but-distinct rows are judged independently.
      doomed = {}
      @table.each { |row| doomed[row.object_id] = true if yield(row) }
      @table.delete_if { |row| doomed.has_key?(row.object_id) }
      refresh_indexes
      self
    end

    # Folds continuation rows into the row above them and returns self.
    #
    # A row whose cell in column +colnum+ is blank is treated as a
    # continuation: each of its cells is joined, after a newline, onto the
    # matching cell of the previous row. An empty table is left untouched.
    def demerge!(colnum=0)
      return self if @table.empty?

      new_table = [@table[0]]
      (1..@table.count-1).each { |idx| demerge_it(colnum, idx, new_table) }
      @table = new_table
      refresh_indexes
      self
    end

    # Renames a column in the header and in +colindex+, returning the new
    # name. Raises RuntimeError if +old_name+ is not a column.
    def rename_column(old_name, new_name)
      colnum = @colindex[old_name]
      raise "Unknown column '#{old_name}'" if colnum.nil?
      # Delete before inserting so renaming a column to itself keeps its entry.
      @colindex.delete(old_name)
      @colindex[new_name] = colnum
      @table[0][colnum] = new_name
      @idcolumn = new_name if !@rowindex.nil? && @idcolumn == old_name
      new_name
    end

    # Deletes a column from every row and returns its former column number.
    # The indexes are rebuilt so the remaining columns map to their new
    # positions. Raises RuntimeError if +name+ is not a column.
    def delete_column(name)
      colnum = @colindex[name]
      raise "Unknown column '#{name}'" if colnum.nil?
      each_row { |row| row.delete_at(colnum) }
      refresh_indexes
      colnum
    end

    # Returns all values of a column, header cell included.
    # Raises RuntimeError if +name+ is not a column.
    def get_column(name)
      colnum = @colindex[name]
      raise "Unknown column '#{name}'" if colnum.nil?
      @table.map { |row| row[colnum] }
    end

    # Adds every non-header row of +other_table+ to this table, matching
    # values to columns by header name.
    def column_copy(other_table)
      other_table.get_each_row do |other_row|
        add_row(other_row)
      end
    end

    # The header values joined with commas; an empty string for an empty table.
    def signature
      (header || []).join(',')
    end

    private

    # Appends an array row, checking its length against the header.
    def add_row_array(row)
      raise "Argument must be an array" unless row.is_a?(Array)
      unless header.nil?
        n = header.count
        raise "Row '#{row[0]}' must have #{n} values...found only #{row.count}" unless row.count == n
      end
      @table << row
      build_column_index if @table.count == 1
    end

    # Appends a hash row; cells without a matching key are left nil.
    def add_row_hash(row)
      raise "Argument must be a hash table" unless row.is_a?(Hash)
      new_row = Array.new(column_count)
      row.each_pair do |k, v|
        idx = @colindex[k]
        new_row[idx] = v unless idx.nil?
      end
      @table << new_row
    end

    # Adds a header row and a first data row from a hash. The :rownum key
    # added by get_row is metadata and is not turned into a column.
    def add_first_row_hash(row)
      raise "Argument must be a hash table" unless row.is_a?(Hash)
      new_header = []
      new_row = []
      row.each_pair do |k, v|
        next if k == :rownum
        new_header << k
        new_row << v
      end
      add_row_array(new_header)
      add_row_array(new_row)
    end

    # Rebuilds rowindex from every non-header row.
    def build_row_index
      @rowindex = {}
      index_rows_from(1)
    end

    # Records the id of every row from +start+ onwards in rowindex. Later
    # rows win when an id occurs more than once.
    def index_rows_from(start)
      col = @colindex[@idcolumn]
      return if col.nil?
      first = start < 1 ? 1 : start   # never index the header row
      (first...@table.size).each { |i| @rowindex[@table[i][col]] = i }
    end

    # Rebuilds colindex from the header; an empty table gets an empty index.
    def build_column_index
      @colindex = {}
      return @colindex if header.nil?
      header.each_with_index { |value, idx| @colindex[value] = idx }
    end

    # Rebuilds both indexes after rows or columns have moved. If the id
    # column no longer exists, the id column and row index are cleared.
    def refresh_indexes
      build_column_index
      return if @rowindex.nil?
      if @colindex.has_key?(@idcolumn)
        build_row_index
      else
        @idcolumn = nil
        @rowindex = nil
      end
    end

    # Builds the hash form of row +row_num+ (header value => cell, plus
    # :rownum).
    def get_row_by_num(row_num)
      row = @table[row_num]
      raise "Unknown row '#{row_num}'" if row.nil?
      result = {}
      row.each_with_index { |v, idx| result[header[idx]] = v }
      result[:rownum] = row_num
      result
    end

    # Pushes row +idx+ onto +new_table+, or merges it into the last pushed
    # row when its cell in +colnum+ is blank (nil counts as blank).
    def demerge_it(colnum, idx, new_table)
      next_row = @table[idx]
      if next_row[colnum].to_s.strip == ""
        row = new_table.pop
        new_table.push(demerge_two_rows(row, next_row))
      else
        new_table.push(next_row)
      end
    end

    # Joins two rows cell by cell with a newline and strips the result.
    # Cells are converted with to_s, so nil cells contribute nothing.
    def demerge_two_rows(r1, r2)
      raise "Column number mismatch" if r1.count != r2.count
      r1.zip(r2).map { |upper, lower| "#{upper}\n#{lower}".strip }
    end

  end

end
|