yomise 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/yomise.rb CHANGED
@@ -1,290 +1,321 @@
1
- # frozen_string_literal: true
2
- require "csv"
3
- require "roo-xls"
4
- require "spreadsheet"
5
- require "rover"
6
- require "daru"
7
- require_relative "./to_csv"
8
- require_relative "./longest_line"
9
- require_relative "yomise/version"
10
-
11
- module Yomise
12
- class Error < StandardError; end
13
-
14
- module_function
15
-
16
- def read(path, **opt)
17
- return /csv$/i === path ? read_csv(path, **opt) : read_excel(path, **opt)
18
- end
19
-
20
- # ##Generate Array from CSV File, and convert it to Hash or DataFrame.
21
- # **opt candidate= line_from: 1, header: 0
22
- # ver. 0.3.8~ default format=:daru
23
- def read_csv(path, format: :daru, encoding: "utf-8", liberal_parsing: true, col_sep: ",", index: nil, **opt)
24
- ## TODO.. index: option that designate column number to generate DF index.
25
- ## That is, revicing set_index method.
26
-
27
- # Get 2D Array
28
- begin
29
- if liberal_parsing
30
- csvd = CSV.read(path, encoding: encoding, liberal_parsing: true)
31
- if encoding.to_s.downcase != "utf-8"
32
- csv = csvd.to_a.map {|l| l.map {|cell| cell.nil? ? nil : cell.encode("utf-8", invalid: :replace, replace: '') }}
33
- else
34
- csv = csvd
35
- end
36
-
37
- encoding = "utf-8"
38
- else
39
- # Old style (Not Recommended)
40
- # This "&:read" is not Yomise's function(defined avobe here).. parhaps File's method.
41
- csv = CSV.parse(File.open(path, encoding: encoding, &:read), col_sep: col_sep)
42
- end
43
- rescue
44
- # Try Another Encoding
45
- ## puts "Fail Encoding #{encoding}. Trying cp932..."
46
- if liberal_parsing
47
- csvd = CSV.read(path, encoding: "cp932", liberal_parsing: true)
48
- if encoding.to_s.downcase != "utf-8"
49
- csv = csvd.to_a.map {|l| l.map {|cell| cell.nil? ? nil : cell.encode("utf-8", invalid: :replace, replace: '') }}
50
- else
51
- csv = csvd
52
- end
53
-
54
- encoding = "UTF-8"
55
- else
56
- # Old style (Not Recommended)
57
- # This "&:read" is not Yomise's function(defined avobe here).. parhaps File's method.
58
- csv = CSV.parse(File.open(path, encoding: "cp932", &:read), col_sep: col_sep)
59
- end
60
- encoding = "cp932"
61
- end
62
-
63
- if format.to_s == "array"
64
- return csv
65
- elsif format.to_s == "hash"
66
- h, i = to_hash(csv, **opt)
67
- return h
68
- elsif format.to_s == "csv"
69
- return csv.to_csv
70
- elsif format.to_s == "numo"
71
- return csv # Under Construction
72
- else # include format.nil? (in this case, convert to Daru::DF).
73
-
74
- h, ind_orig = to_hash(csv, index: index, **opt)
75
- ans = to_df(h, format: format)
76
-
77
- # Converting Encode and Setting index.. rover not supported yet
78
- if format.to_s == "daru" || format.nil?
79
- ans.convert_enc!(from: encoding, to: "utf-8") if encoding.to_s.downcase != "utf-8"
80
- begin
81
- ans.index = ind_orig if index
82
- rescue
83
- warn "Indexing failed (Parhaps due to duplicated index)."
84
- end
85
- end
86
-
87
- return ans
88
- end
89
- end
90
-
91
- # ##Generate Array from EXCEL File, and convert it to Hash or DataFrame.
92
- # **opt candidate= line_from: 1, header: 0)
93
- def read_excel(path, sheet_i: 0, format: :daru, encoding: "utf-8", index: nil, **opt)
94
- a2d = open_excel(path, sheet_i, encoding: encoding) # Get 2D Array
95
-
96
- if format.to_s == "array"
97
- return a2d
98
- elsif format.to_s == "hash"
99
- h, i = to_hash(a2d, **opt)
100
- return h
101
- elsif format.to_s == "csv"
102
- return a2d.to_csv
103
- elsif format.to_s == "numo"
104
- return a2d # Under Construction
105
- else # include format.nil?
106
- h, ind_orig = to_hash(a2d, index: index, **opt)
107
- ans = to_df(h, format: format)
108
- if format.to_s == "daru" || format.nil?
109
- begin
110
- ans.index = ind_orig if index
111
- rescue
112
- warn "Indexing failed (Parhaps due to duplicated index)."
113
- end
114
- end
115
- return ans
116
- end
117
- end
118
-
119
- # Convert 2d Array to Hash
120
- ## header: nil -> Default Headers(:column1, column2,...) are generated.
121
- ## Option line_ignored, is not implemented yet.
122
- def to_hash(array2d, line_from: 1, line_until: nil, line_ignored: nil,
123
- column_from: nil, column_until: nil,
124
- header: 0, symbol_header: false,
125
- replaced_by_nil: [], analyze_type: true,
126
- index: nil)
127
- ## TODO.. column_from: , column_until:
128
-
129
- # Define Read Range------------
130
- lfrom, luntil = line_from, line_until
131
- lf_reg, lu_reg = line_from.kind_of?(Regexp), line_until.kind_of?(Regexp)
132
-
133
- if lf_reg || lu_reg
134
- lines_ary = array2d.map{ _1.join "," }
135
- lfrom = lines_ary.find_index{ line_from === _1 } if lf_reg
136
- luntil = (lines_ary.length-1) - lines_ary.reverse.find_index{ line_until === _1 } if lu_reg
137
- end
138
-
139
- # And get originally array-----
140
- output = array2d[lfrom...luntil]
141
- # -----------------------------
142
-
143
- # Then get data of index-------
144
- ind_orig = index ? output.map{ _1[index] } : nil
145
- # -----------------------------
146
-
147
- # Selecct Column---------------
148
- output = output.map { _1[column_from...column_until] } if column_from || column_until
149
-
150
- # Define Data Array------------
151
- output_transpose = output[0].zip(*output[1..])
152
- output_transpose = fix_array(output_transpose, replaced_by_nil, analyze_type)
153
- # -----------------------------
154
-
155
- # Define Header----------------
156
- if header
157
- hd = check_header(array2d[header])[column_from...column_until]
158
- else
159
- hd = [*0...(output.longest_line)].map{"column#{_1}"}
160
- end
161
- # hd = header.nil? ? [*0...(output.longest_line)].map{"column#{_1}"} : check_header(array2d[header])
162
-
163
- hd = hd.map { _1.intern } if symbol_header
164
- # -----------------------------
165
-
166
- # Make Hash(Header => Data Array)
167
- return hd.each_with_object({}).with_index {|(hdr, hash), i| hash[hdr]=output_transpose[i]}, ind_orig
168
- end
169
-
170
- # Convert Hash to DataFrame
171
- def to_df(d, format: :daru)
172
- if format.to_s == "daru" || format.nil?
173
- Daru::DataFrame.new(d)
174
- else
175
- Rover::DataFrame.new(d)
176
- end
177
- end
178
-
179
- #----------------------------
180
- # Private metods from here
181
- #----------------------------
182
-
183
- # Genarate Array from excel file
184
- def open_excel(path, sheet_i, encoding: "utf-8")
185
- if /xls(x|m)$/ === path
186
- puts "Sorry, encoding option is not supported yet for xlsx file." if encoding != "utf-8"
187
-
188
- book = Roo::Excelx.new(path)
189
- s = book.sheet(sheet_i)
190
-
191
- ## bottole neck
192
- return s.to_a
193
-
194
- # xls
195
- else
196
- begin
197
- Spreadsheet.client_encoding = encoding
198
- ss = Spreadsheet.open(path)
199
- rescue Encoding::InvalidByteSequenceError
200
- puts "Fail Encoding #{encoding}. Trying Windows-31J..."
201
- Spreadsheet.client_encoding = "Windows-31J"
202
- ss = Spreadsheet.open(path)
203
- end
204
-
205
- a2d = []
206
- ss.worksheets[sheet_i].rows.each do |row|
207
- a1d = []
208
- row.each {|cell| a1d.push cell}
209
- a2d.push a1d
210
- end
211
-
212
- return a2d
213
- end
214
- end
215
-
216
- # Fix Array (Replace specific values to nil, recognize value type and cast values to the type.)
217
- def fix_array(array2d, replaced_by_nil, analyze_type)
218
- ans = array2d
219
-
220
- ## Replace Blank or User-Selected Value
221
- ans = ans.map do |column|
222
- column.map { |cell| replaced_by_nil.include?(cell) || /^\s*$/ === cell ? nil : cell }
223
- end
224
-
225
- ## Replace Number Values to Integer or Float
226
- if analyze_type
227
- ans = ans.map.with_index do |column, i|
228
- type_of_column = :any
229
- column.each { |cell| type_of_column = recognize_type(cell, type_of_column) }
230
-
231
- # p type_of_column
232
- case type_of_column
233
- when :int
234
- column.map { _1.nil? ? nil : _1.to_i }
235
- when :float
236
- column.map { _1.nil? ? nil : _1.to_f }
237
- else
238
- column
239
- end
240
- end
241
- end
242
-
243
- return ans
244
- end
245
-
246
- def recognize_type(str, expected)
247
- return expected if str.nil?
248
-
249
- order = {:any => 0, :int => 1, :float => 2, :string => 3}
250
- if /^\s*(-|\+)?\d+\s*$/ === str
251
- type_of_str = :int
252
- elsif /^\s*(-|\+)?\d*\.\d*\s*$/ === str || /^\s*(-|\+)?(\d*\.\d+|\d+)(e|E)(-|\+)?\d+\s*$/ === str
253
- type_of_str = :float
254
- else
255
- type_of_str = :string
256
- end
257
-
258
- # p "#{type_of_str}, #{str}" if order[type_of_str] > order[expected]
259
-
260
- return order[type_of_str] > order[expected] ? type_of_str : expected
261
- end
262
-
263
- # Fix blank or duplicated header
264
- def check_header(header_array)
265
- # Check Blank
266
- ans = header_array.map.with_index do |item, i|
267
- if item.nil?
268
- "column#{i}"
269
- elsif item.kind_of?(String)
270
- temp = /^\s*$/ === item ? "column#{i}" : item.gsub(/\s+/, "")
271
- /^\d+$/ === temp ? "column#{i}" : temp
272
- else
273
- item.to_s
274
- end
275
- end
276
-
277
- # Check Duplicated Value
278
- dup_check = (0...(header_array.length)).group_by {|i| ans[i]}
279
- dup_check.each do |item, i_s|
280
- if i_s.length > 1
281
- i_s.each_with_index {|i, index_in_i_s| ans[i] = "#{ans[i]}_#{index_in_i_s}"}
282
- end
283
- end
284
-
285
- return ans
286
- end
287
-
288
- private_class_method :open_excel, :fix_array, :check_header
289
-
290
- end
1
+ # frozen_string_literal: true
2
+ require "csv"
3
+ require "roo-xls"
4
+ require "spreadsheet"
5
+ require "rover"
6
+ require "daru"
7
+ require_relative "./to_csv"
8
+ require_relative "./longest_line"
9
+ require_relative "yomise/version"
10
+
11
+ module Yomise
12
+ class Error < StandardError; end
13
+
14
+ module_function
15
+
16
# Read a tabular file, choosing the reader from the file name.
#
# A path whose name ends in "csv" (case-insensitive) is handed to read_csv;
# anything else is treated as an Excel workbook and handed to read_excel.
#
# @param path [String, #to_s] file to read (a Pathname works as well)
# @param opt [Hash] keyword options forwarded untouched to the chosen reader
# @return whatever read_csv / read_excel returns for the requested format
def read(path, **opt)
  # Regexp#=== is always false for non-String objects such as Pathname,
  # so normalise with to_s before testing the suffix.
  if path.to_s.match?(/csv\z/i)
    read_csv(path, **opt)
  else
    read_excel(path, **opt)
  end
end
19
+
20
# Read a CSV file into a 2D Array and convert it to the requested format.
#
# The file is first parsed with the requested encoding. If that raises, the
# read is retried once as cp932 (Windows-31J).
#
# @param path [String] CSV file path
# @param format [Symbol, String, nil] :array, :hash, :csv, :numo (currently the
#   raw 2D Array), :daru, or anything else (including nil) for the to_df default
# @param encoding [String] encoding the file is expected to be written in
# @param liberal_parsing [Boolean] parse with CSV's liberal_parsing mode; in this
#   mode non-UTF-8 cells are transcoded to UTF-8 while loading
# @param reconvert_utf8 [Boolean] additionally push every cell through
#   String#encode("UTF-8") after loading
# @param col_sep [String] column separator (honoured in both parsing modes)
# @param index [Integer, nil] column number whose values become the Daru index
# @param opt [Hash] options forwarded to to_hash (e.g. line_from: 1, header: 0)
# @return [Array, Hash, String, Object] the 2D Array, the column Hash, the CSV
#   text, or the data frame built by to_df
def read_csv(path, format: :rover, encoding: "utf-8", liberal_parsing: true, reconvert_utf8: false, col_sep: ",", index: nil, **opt)
  # Loads the rows using one encoding and reports the encoding the returned
  # cells are actually in, so later steps never transcode twice.
  load_rows = lambda do |enc|
    if liberal_parsing
      rows = CSV.read(path, encoding: enc, liberal_parsing: true, col_sep: col_sep).to_a
      unless enc.to_s.casecmp?("utf-8")
        rows = rows.map { |line| line.map { |cell| cell&.encode("utf-8", invalid: :replace, replace: "") } }
      end
      [rows, "utf-8"]
    else
      # Old style (not recommended): cells keep the encoding they were read in.
      # "&:read" here is the file object's read, not Yomise.read.
      [CSV.parse(File.open(path, encoding: enc, &:read), col_sep: col_sep), enc]
    end
  end

  begin
    csv, encoding = load_rows.call(encoding)
  rescue StandardError
    # Requested encoding failed: try cp932 once before giving up.
    csv, encoding = load_rows.call("cp932")
  end

  if reconvert_utf8
    csv = csv.map { |line| line.map { |cell| cell&.encode("UTF-8") } }
  end

  case format.to_s
  when "array", "numo" # numo: under construction, returns the raw rows for now
    csv
  when "hash"
    to_hash(csv, **opt).first
  when "csv"
    csv.to_csv
  else
    h, ind_orig = to_hash(csv, index: index, **opt)
    ans = to_df(h, format: format)

    # Encoding conversion and index assignment are only done for Daru;
    # Rover is not supported here yet.
    if format.to_s == "daru"
      ans.convert_enc!(from: encoding, to: "utf-8") unless encoding.to_s.casecmp?("utf-8")
      begin
        ans.index = ind_orig if index
      rescue StandardError
        warn "Indexing failed (Parhaps due to duplicated index)."
      end
    end

    ans
  end
end
93
+
94
# Read one sheet of an Excel workbook into a 2D Array and convert it to the
# requested format.
#
# @param path [String] workbook path
# @param sheet_i [Integer] sheet selector handed to open_excel
# @param format [Symbol, String, nil] :array, :hash, :csv, :numo (currently the
#   raw 2D Array), :daru, or anything else (including nil) for the to_df default
# @param encoding [String] encoding handed to open_excel
# @param index [Integer, nil] column number whose values become the Daru index
# @param opt [Hash] options forwarded to to_hash (e.g. line_from: 1, header: 0)
# @return [Array, Hash, String, Object] the 2D Array, the column Hash, the CSV
#   text, or the data frame built by to_df
def read_excel(path, sheet_i: 0, format: :rover, encoding: "utf-8", index: nil, **opt)
  rows = open_excel(path, sheet_i, encoding: encoding)
  kind = format.to_s

  case kind
  when "array"
    rows
  when "hash"
    to_hash(rows, **opt).first
  when "csv"
    rows.to_csv
  when "numo"
    rows # Under construction
  else
    table, index_values = to_hash(rows, index: index, **opt)
    frame = to_df(table, format: format)

    # The index is only applied to Daru frames.
    if kind == "daru"
      begin
        frame.index = index_values if index
      rescue
        warn "Indexing failed (Parhaps due to duplicated index)."
      end
    end

    frame
  end
end
121
+
122
# Convert a 2D Array (rows) into a Hash of header => column values.
#
# @param array2d [Array<Array>] all rows, including the header row
# @param line_from [Integer, Regexp, nil] first data row; a Regexp selects the
#   first row whose comma-joined text matches
# @param line_until [Integer, Regexp, nil] exclusive end row; a Regexp selects
#   the last matching row; nil (or a Regexp that matches nothing) reads to the end
# @param line_ignored [Object] accepted but not implemented yet
# @param column_from [Integer, nil] first column kept
# @param column_until [Integer, nil] exclusive end column
# @param header [Integer, nil] row number holding the header; nil generates
#   "column0", "column1", ...
# @param symbol_header [Boolean] convert header names to Symbols
# @param replaced_by_nil [Array] cell values to be replaced by nil (see fix_array)
# @param analyze_type [Boolean] cast numeric-looking columns (see fix_array)
# @param index [Integer, nil] column whose raw values are returned as second value
# @return [Array(Hash, Array)] the column Hash and the index values (nil when
#   index is not given)
def to_hash(array2d, line_from: 1, line_until: nil, line_ignored: nil,
            column_from: nil, column_until: nil,
            header: 0, symbol_header: false,
            replaced_by_nil: [], analyze_type: true,
            index: nil)
  # Resolve the row range; Regexp bounds are matched against the joined row text.
  lfrom = line_from
  luntil = line_until
  if line_from.is_a?(Regexp) || line_until.is_a?(Regexp)
    joined = array2d.map { _1.join(",") }
    lfrom = joined.find_index { line_from === _1 } if line_from.is_a?(Regexp)
    # rindex yields nil when nothing matches, which means "read to the end"
    # instead of failing on nil arithmetic.
    luntil = joined.rindex { line_until === _1 } if line_until.is_a?(Regexp)
  end

  output = array2d[lfrom...luntil] || []

  # Index values are taken before any column selection.
  ind_orig = index ? output.map { _1[index] } : nil

  if column_from || column_until
    output = output.map { _1[column_from...column_until] || [] }
  end

  hd = if header
         check_header(array2d[header])[column_from...column_until]
       else
         [*0...(output.longest_line)].map { "column#{_1}" }
       end

  # Build exactly one column per header. Reading cell by cell keeps values of
  # rows that are longer than the first data row and yields empty columns
  # when no data row was selected.
  columns = Array.new(hd.length) { |col| output.map { |row| row[col] } }
  columns = fix_array(columns, replaced_by_nil, analyze_type)

  hd = hd.map(&:intern) if symbol_header

  [hd.each_with_index.to_h { |name, i| [name, columns[i]] }, ind_orig]
end
172
+
173
# Build a data frame from a Hash of header => column values.
#
# @param d [Hash] column data
# @param format [Symbol, String, nil] "daru" selects Daru::DataFrame; any other
#   value (including nil) selects Rover::DataFrame
# @return [Daru::DataFrame, Rover::DataFrame]
def to_df(d, format: :rover)
  frame_class = format.to_s == "daru" ? Daru::DataFrame : Rover::DataFrame
  frame_class.new(d)
end
181
+
182
# For Rover: classify a single value as available or not, to build a mask that
# filters out nil / NaN or a two-valued (true/false) column.
#
# @param value [Object] value to inspect
# @param truevalue [Object] returned when the value is available
# @param falsevalue [Object] returned for nil, NaN and (optionally) ""
# @param blank_str_is_false [Boolean] whether an empty String counts as unavailable
# @return [Object] truevalue or falsevalue
def is_available(value, truevalue: true, falsevalue: false, blank_str_is_false: true)
  case value
  when nil
    falsevalue
  when Numeric
    # Not every Numeric defines nan? (Integer and Rational do not), so ask
    # first instead of raising NoMethodError on plain integers.
    (value.respond_to?(:nan?) && value.nan?) ? falsevalue : truevalue
  when String
    (value.empty? && blank_str_is_false) ? falsevalue : truevalue
  else
    truevalue
  end
end
200
+
201
# Apply is_available to every element of a Rover::Vector, or to every column
# of a Rover::DataFrame.
#
# @param data [Rover::Vector, Rover::DataFrame] values to classify
# @param truevalue [Object] forwarded to is_available
# @param falsevalue [Object] forwarded to is_available
# @param blank_str_is_false [Boolean] forwarded to is_available
# @return [Object, nil] the mapped vector, a new Rover::DataFrame with the same
#   keys, or nil when data is neither of the two types
def available(data, truevalue: true, falsevalue: false, blank_str_is_false: true)
  flags = { truevalue: truevalue, falsevalue: falsevalue, blank_str_is_false: blank_str_is_false }

  case data
  when Rover::Vector
    data.map { |element| is_available(element, **flags) }
  when Rover::DataFrame
    masked_columns = data.keys.map do |key|
      data[key].map { |element| is_available(element, **flags) }
    end
    Rover::DataFrame.new(data.keys.zip(masked_columns).to_h)
  end
end
209
+
210
+ #----------------------------
211
+ # Private metods from here
212
+ #----------------------------
213
+
214
# Load one sheet of an Excel workbook as a 2D Array.
#
# Paths ending in "xlsx" / "xlsm" (case-insensitive) are read with Roo;
# everything else is read as legacy xls with Spreadsheet.
#
# @param path [String, #to_s] workbook path
# @param sheet_i [Integer] sheet selector (passed to Roo's sheet, or used to
#   index Spreadsheet's worksheets)
# @param encoding [String] client encoding for xls files; ignored for xlsx
# @return [Array<Array>] the rows of the sheet
def open_excel(path, sheet_i, encoding: "utf-8")
  if path.to_s.match?(/xls[xm]\z/i)
    # Diagnostics go to stderr, like the other warnings in this module.
    unless encoding.to_s.casecmp?("utf-8")
      warn "Sorry, encoding option is not supported yet for xlsx file."
    end

    # Materialising the whole sheet is the known bottleneck.
    return Roo::Excelx.new(path).sheet(sheet_i).to_a
  end

  # Legacy xls
  begin
    Spreadsheet.client_encoding = encoding
    book = Spreadsheet.open(path)
  rescue Encoding::InvalidByteSequenceError
    warn "Fail Encoding #{encoding}. Trying Windows-31J..."
    Spreadsheet.client_encoding = "Windows-31J"
    book = Spreadsheet.open(path)
  end

  # Copy every row into a plain Array of its cells.
  book.worksheets[sheet_i].rows.map do |row|
    row.each_with_object([]) { |cell, cells| cells << cell }
  end
end
246
+
247
# Clean up column data: replace blank or user-selected values with nil and,
# optionally, cast columns that look numeric to Integer or Float.
#
# @param array2d [Array<Array>] data as an Array of columns
# @param replaced_by_nil [Array] cell values to be replaced with nil
# @param analyze_type [Boolean] when true, each column's type is detected with
#   recognize_type and :int / :float columns are cast with to_i / to_f
# @return [Array<Array>] new column Arrays; the input is not modified
def fix_array(array2d, replaced_by_nil, analyze_type)
  # \A / \z anchor the whole cell. With ^ / $ a multi-line cell containing an
  # empty line would be mistaken for a blank cell and lost.
  blank = /\A\s*\z/

  cleaned = array2d.map do |column|
    column.map { |cell| (replaced_by_nil.include?(cell) || blank === cell) ? nil : cell }
  end
  return cleaned unless analyze_type

  cleaned.map do |column|
    case column.reduce(:any) { |type, cell| recognize_type(cell, type) }
    when :int
      column.map { _1&.to_i }
    when :float
      column.map { _1&.to_f }
    else
      column
    end
  end
end
276
+
277
# Merge the type of one cell into the type detected so far for its column.
#
# Types are ranked :any < :int < :float < :string, and the higher-ranked of
# the two is returned, so a column only widens and never narrows.
#
# @param str [Object, nil] cell value; nil leaves the type unchanged, and
#   anything that is not a numeric-looking String counts as :string
# @param expected [Symbol] type detected so far (:any, :int, :float or :string)
# @return [Symbol] the resulting type
def recognize_type(str, expected)
  return expected if str.nil?

  order = { any: 0, int: 1, float: 2, string: 3 }

  # \A / \z anchor the whole value, so multi-line text is never numeric.
  # A float needs at least one digit, so a lone "." is not a number.
  type_of_str =
    case str
    when /\A\s*[-+]?\d+\s*\z/
      :int
    when /\A\s*[-+]?(\d+\.\d*|\.\d+)\s*\z/, /\A\s*[-+]?(\d*\.\d+|\d+)[eE][-+]?\d+\s*\z/
      :float
    else
      :string
    end

  order[type_of_str] > order[expected] ? type_of_str : expected
end
293
+
294
# Normalise a header row: fill in blank names and number duplicated ones.
#
# - nil, whitespace-only and digits-only Strings become "column<position>".
# - Other Strings have all whitespace removed.
# - Non-String values are converted with to_s.
# - Names occurring more than once get "_0", "_1", ... appended in order.
#
# @param header_array [Array] raw header cells
# @return [Array<String>] one name per header cell
def check_header(header_array)
  names = header_array.map.with_index do |item, i|
    case item
    when nil
      "column#{i}"
    when String
      # Removing whitespace first makes "blank" mean "nothing left", which
      # also holds for multi-line text (a ^\s*$ test would match any header
      # that merely contains an empty line).
      squeezed = item.gsub(/\s+/, "")
      (squeezed.empty? || squeezed.match?(/\A\d+\z/)) ? "column#{i}" : squeezed
    else
      item.to_s
    end
  end

  # Group positions by name, then suffix every name that is shared.
  names.each_index.group_by { |i| names[i] }.each_value do |positions|
    next if positions.length < 2

    positions.each_with_index { |pos, nth| names[pos] = "#{names[pos]}_#{nth}" }
  end

  names
end
318
+
319
+ private_class_method :open_excel, :fix_array, :check_header
320
+
321
+ end
data/sig/yomise.rbs CHANGED
@@ -1,4 +1,4 @@
1
- module Yomise
2
- VERSION: String
3
- # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
- end
1
+ module Yomise
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end