import_csv 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/import_csv.rb +748 -0
- metadata +44 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 4b331c5ae2a5fd009f9679efbb6d5b69c52c3555
|
4
|
+
data.tar.gz: 2cf6adef2752499a5e013db244319e31cb97a655
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: fcd5b98e5261cded35cab1d87fd6b487c5b8bcd93bc693c7a1840ab6a225793cf7a93be1a04b5b0fada631d5de275380c239507bafc8623721ca1a63b3269692
|
7
|
+
data.tar.gz: b7b13d3b3ddc5c8c2868ecf92268da30c3ef5bb3dd6867f7fb60a9505511a8922d20d9ec9dbddcfa78edb1e3314bbc3b009223747cdcdff4f197dd3bc79a75f5
|
data/lib/import_csv.rb
ADDED
@@ -0,0 +1,748 @@
|
|
1
|
+
# Kasyfil Aziz Tri Cahyana <tricahyana@windowslive.com> <kasyfil.aziz@wgs.co.id> 2016
|
2
|
+
#
|
3
|
+
# Require the file below in your Ruby on Rails application:
|
4
|
+
# lib/import_csv/import.rb
|
5
|
+
#
|
6
|
+
# This library can make import data from large csv (>10M) faster and very low
|
7
|
+
# memory usage (depend on preload data setting).
|
8
|
+
#
|
9
|
+
# This library will get data line by line and parse to array, it's using CSV.parse
|
10
|
+
# but more efficient. Rather than parse line by line, this library will load some
|
11
|
+
# line to memory first and parse to array. It called preload data. Default
|
12
|
+
# preload data is 1000, but you can define preload data by your own. You can also
|
13
|
+
# configure parse options like the CSV parse options in the standard Ruby library.
|
14
|
+
#
|
15
|
+
# Example :
|
16
|
+
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
|
17
|
+
# while csv.next
|
18
|
+
# p csv.current[0]
|
19
|
+
# p csv.current[1]
|
20
|
+
# ... your code ...
|
21
|
+
# end
|
22
|
+
#
|
23
|
+
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
|
24
|
+
# csv.each do |line|
|
25
|
+
# p line[0]
|
26
|
+
# p line[1]
|
27
|
+
# ... your code ...
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv')) do |line|
|
31
|
+
# p line[0]
|
32
|
+
# p line[1]
|
33
|
+
# ... your code ...
|
34
|
+
# end
|
35
|
+
#
|
36
|
+
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true) do |line|
|
37
|
+
# p line['location_id']
|
38
|
+
# p line['location_name']
|
39
|
+
# ... your code ...
|
40
|
+
# end
|
41
|
+
#
|
42
|
+
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true)
|
43
|
+
# while csv.next
|
44
|
+
# p csv.location_id
|
45
|
+
# p csv.location_name
|
46
|
+
# ... your code ...
|
47
|
+
# end
|
48
|
+
#
|
49
|
+
# - Setting preload data. You can setting preload data by set preload attribute
|
50
|
+
# or in ImportCSV.new
|
51
|
+
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
|
52
|
+
# csv.preload = 2000
|
53
|
+
# csv.each do |line|
|
54
|
+
# p line[0]
|
55
|
+
# p line[1]
|
56
|
+
# ... your code ...
|
57
|
+
# end
|
58
|
+
#
|
59
|
+
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), preload: 2000)
|
60
|
+
# csv.each do |line|
|
61
|
+
# p line[0]
|
62
|
+
# p line[1]
|
63
|
+
# ... your code ...
|
64
|
+
# end
|
65
|
+
#
|
66
|
+
# - Setting Automatic header, get header from first row in file and ignore
|
67
|
+
# first row.
|
68
|
+
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true, preload: 2000)
|
69
|
+
# csv.each do
|
70
|
+
# p csv.location_id
|
71
|
+
# p csv.location_name
|
72
|
+
# ... your code ...
|
73
|
+
# end
|
74
|
+
#
|
75
|
+
# - Define header by your self. If you define header your self, this script
|
76
|
+
# will not ignore first row in file.
|
77
|
+
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: ['location_id', 'location_name'])
|
78
|
+
# csv.each do
|
79
|
+
# p csv.location_id
|
80
|
+
# p csv.location_name
|
81
|
+
# ... your code ...
|
82
|
+
# end
|
83
|
+
#
|
84
|
+
# - Define header by instance method.
|
85
|
+
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
|
86
|
+
# csv.set_header ['location_id', 'location_name']
|
87
|
+
# csv.each do
|
88
|
+
# p csv.location_id
|
89
|
+
# p csv.location_name
|
90
|
+
# ... your code ...
|
91
|
+
# end
|
92
|
+
#
|
93
|
+
# - Call data using hash. Header must be set.
|
94
|
+
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
|
95
|
+
# csv.set_header ['location_id', 'location_name']
|
96
|
+
# csv.each do
|
97
|
+
# p csv['location_id']
|
98
|
+
# p csv['location_name']
|
99
|
+
# ... your code ...
|
100
|
+
# end
|
101
|
+
#
|
102
|
+
# - Setting parse option. (Read: http://ruby-doc.org/stdlib-2.0.0/libdoc/csv/rdoc/CSV.html#class-CSV-label-CSV+and+Character+Encodings+-28M17n+or+Multilingualization-29)
|
103
|
+
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true, parse_options: {col_sep: ';', quote_char: '"'})
|
104
|
+
# csv.each do |line|
|
105
|
+
# p line['location_id']
|
106
|
+
# p line['location_name']
|
107
|
+
# ... your code ...
|
108
|
+
# end
|
109
|
+
#
|
110
|
+
# - Parallel processing. Instead of looping row by row, `each_preload` can pass an
|
111
|
+
# array of preload data so you can send to background process like Sidekiq.
|
112
|
+
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true, return_preload_only: true)
|
113
|
+
# csv.each_preload do |preload_data|
|
114
|
+
# CsvWorker.perform_async(preload_data)
|
115
|
+
# end
|
116
|
+
#
|
117
|
+
# - Filter data. For more examples, see the documentation of the `where` method below.
|
118
|
+
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true)
|
119
|
+
# csv.where(location_name: 'jakarta')
|
120
|
+
# csv.each do |line|
|
121
|
+
# p line.location_id
|
122
|
+
# p line.location_name
|
123
|
+
# ... your code ...
|
124
|
+
# end
|
125
|
+
#
|
126
|
+
|
127
|
+
require 'csv'
|
128
|
+
|
129
|
+
class ImportCSV
|
130
|
+
# preload data
|
131
|
+
attr_accessor :preload
|
132
|
+
|
133
|
+
# current line number in file
|
134
|
+
attr_accessor :line_count
|
135
|
+
|
136
|
+
# file path (string)
|
137
|
+
attr_accessor :file_path
|
138
|
+
|
139
|
+
# file object (File)
|
140
|
+
attr_accessor :file_object
|
141
|
+
|
142
|
+
# header (Array)
|
143
|
+
attr_accessor :header
|
144
|
+
|
145
|
+
# whether a header is in use (Boolean)
|
146
|
+
attr_accessor :has_header
|
147
|
+
attr_accessor :define_header_by_your_self
|
148
|
+
|
149
|
+
# current parsed line of the csv file (Array of column values)
|
150
|
+
attr_accessor :current
|
151
|
+
|
152
|
+
# current preload data
|
153
|
+
attr_accessor :current_preload
|
154
|
+
|
155
|
+
# Boolean. If true, will loop through file and send current preload data to
|
156
|
+
# block function
|
157
|
+
attr_accessor :return_preload_only
|
158
|
+
|
159
|
+
# set parse options
|
160
|
+
attr_accessor :parse_options
|
161
|
+
|
162
|
+
attr_accessor :query
|
163
|
+
|
164
|
+
attr_accessor :background_task
|
165
|
+
|
166
|
+
attr_accessor :next_preload
|
167
|
+
|
168
|
+
attr_accessor :file_eof
|
169
|
+
|
170
|
+
# Class constructor.
# Opens the file at +file_path+ for reading and stores the reader settings.
#
# Options (all optional):
#   :preload             - number of lines read and parsed per chunk (default 1000)
#   :parse_options       - Hash of CSV parse options, stored as given (default {})
#   :return_preload_only - flag stored on the object (default false)
#   :header              - `true` to read the header from the first line of the
#                          file, or an Array of column names for a file whose
#                          first line is already data
#
# If a block is given, `each` is called immediately and every row is passed to
# the block together with the line counter.
#
# Example :
#   ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv')) do |line|
#     p line[0]
#     p line[1]
#     ... your code ...
#   end
#
def initialize(file_path, options = Hash.new)
  self.preload = options[:preload] || 1000
  self.line_count = 1
  self.file_path = file_path
  self.parse_options = options[:parse_options] || {}
  self.return_preload_only = options[:return_preload_only] || false
  self.file_object = File.open(self.file_path, 'r')
  self.background_task = nil
  self.next_preload = []
  self.file_eof = false

  header_option = options[:header]
  if header_option.kind_of?(Array)
    # Go through set_header so an explicit header is normalised (downcased)
    # and flagged as user-defined. `reopen` checks that flag to decide whether
    # the first line of the file is a header to skip or a data row to keep.
    self.set_header(header_option)
  elsif header_option
    self.has_header = true
    # consumes the first line of the file and creates the column readers
    self.header_generator
  end

  self.current = []
  self.current_preload = []
  self.query = Hash.new

  self.each { |line, line_count| yield line, line_count } if block_given?
end
|
208
|
+
|
209
|
+
# Returns the base name (last path component) of the currently opened file.
def file_name
  File.basename(file_object)
end
|
213
|
+
|
214
|
+
# Yields the file chunk by chunk instead of row by row. The number of lines
# read per chunk is governed by the `preload` attribute (default 1000).
#
# This can be useful for parallel processing, e.g. handing each chunk to a
# background worker.
#
# return_hash - when truthy (defaults to `has_header`) each chunk is yielded as
#               an Array of Hashes built by `create_hash`; otherwise as the raw
#               Array of row Arrays held in `current_preload`.
#
# Rows are loaded with `perform_filter` when a query has been set through
# `where`, and with `perform_preload` otherwise.
#
def each_preload(return_hash = self.has_header)
  filtered = !self.query.empty?
  loop do
    loaded = filtered ? self.perform_filter : self.perform_preload
    # Stop on an explicit false from the loader, and also when the loader came
    # back empty-handed at end of file: an empty Array is truthy in Ruby, so
    # testing the return value alone would spin forever once a filtered read
    # reaches EOF.
    break if !loaded || (self.current_preload.empty? && self.file_object.eof?)

    yield(return_hash ? create_hash : self.current_preload)
  end
end
|
242
|
+
|
243
|
+
# Converts every row of `current_preload` into a Hash keyed by the header
# names and returns the resulting Array of Hashes.
def create_hash
  self.current_preload.map { |row| Hash[self.header.zip(row)] }
end
|
250
|
+
|
251
|
+
# Sets the header yourself, so columns can be read through methods named
# after the header entries. Because the header is user-defined, the first
# line of the file is treated as data.
#
# Example:
#   csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
#   csv.set_header ['location_id', 'location_date', 'departure_date']
#   csv.each do
#     p csv.location_id
#     p csv.location_date
#     ... your code ...
#   end
#
# header - Array of column names. Entries are converted with `to_s` and
#          downcased, so Symbols are accepted as well as Strings.
#
# Returns whatever `header_generator` returns.
# Raises ArgumentError when +header+ is not an Array.
#
def set_header(header)
  raise ArgumentError, "header must be an array" unless header.kind_of?(Array)

  # to_s first: header_generator later calls String methods on each entry,
  # which would fail for Symbols or nil.
  self.header = header.map { |name| name.to_s.downcase }
  self.has_header = true
  self.define_header_by_your_self = true
  self.header_generator
end
|
274
|
+
|
275
|
+
# Resolves a column reference to a column position. An Integer is returned
# unchanged; anything else is converted with `to_s`, downcased and looked up
# in `header` (nil when no entry matches).
def get_header_index(header)
  header.kind_of?(Integer) ? header : self.header.index(header.to_s.downcase)
end
|
279
|
+
|
280
|
+
# `define_header` is another name for `set_header`.
alias :define_header :set_header
|
281
|
+
|
282
|
+
# Creates one reader method per header column, plus `[]`, on this object.
# You don't have to call this method in your own code.
#
# When `header` is not an Array yet, the next line of the file is consumed and
# parsed as the header using `parse_options` (so a custom :col_sep applies to
# the header line too). Header names are stored downcased.
#
# Each reader is named after its column: the name is downcased and every
# character outside A-Z/a-z is replaced by `_` (`Full Name` -> `full_name`).
# `[]` accepts a column name (String or Symbol, case-insensitive) or an Integer
# position and returns nil for an unknown column.
#
# Returns the header Array, or false when no header is in use.
def header_generator
  return false unless self.has_header

  unless self.header.kind_of?(Array)
    header_row = CSV.parse(self.file_object.readline, **(self.parse_options || {})).first
    # to_s guards against empty header cells, which CSV parses as nil
    self.header = Array(header_row).map { |name| name.to_s.downcase }
  end
  header_line = self.header

  # create attribute readers based on the csv header
  header_line.each_with_index do |name, index|
    method_name = name.to_s.downcase.gsub(/[^A-Za-z]/, '_')
    next if method_name.empty?

    self.define_singleton_method(method_name) { self.current[index] }
  end

  self.define_singleton_method(:[]) do |key|
    position =
      if key.kind_of?(Integer)
        key
      else
        wanted = key.to_s.downcase
        header_line.index { |name| name.to_s.downcase == wanted }
      end
    position.nil? ? nil : self.current[position]
  end

  self.header
end
|
306
|
+
|
307
|
+
# Iterates over the csv file row by row, using `next` to advance.
#
# Without a header the block receives the current row (Array) and the line
# counter. With a header the block receives this object itself (so columns can
# be read by name) and the line counter.
#
# Example :
#   csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
#   csv.each do |line, line_count|
#     p line[0]
#     p line[1]
#     ... your code ...
#   end
#
# Example :
#   csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true)
#   csv.each do |line, line_count|
#     csv.location_id
#     csv.location_name
#     ... your code ...
#   end
#
def each
  yield_reader = self.has_header
  while self.next
    if yield_reader
      yield self, self.line_count
    else
      yield self.current, self.line_count
    end
  end
end
|
336
|
+
|
337
|
+
# Moves to the next row. Rows are served from `current_preload`; when that
# buffer is empty it is refilled with `perform_preload`, or with
# `perform_filter` when a query has been set through `where`.
#
# After a successful call the row is available in `current` (and through the
# header-named readers when a header is in use) and `line_count` has been
# incremented.
#
# Returns true when a row was loaded, false when there is nothing left to
# read, which makes it usable as a `while` condition.
#
# Example :
#   csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true)
#   csv.next
#   p csv.location_id => `return first line`
#   csv.next
#   p csv.location_id => `return second line`
#
# Example using while :
#   csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true)
#   while csv.next
#     p csv.location_id
#   end
#
def next
  if self.current_preload.empty?
    # buffer drained and nothing left in the file: signal the end
    if self.file_object.eof?
      self.current = []
      return false
    end

    # refill the buffer, filtered when a query is present
    self.query.empty? ? self.perform_preload : self.perform_filter
  end

  # the refill may have produced no rows at all
  return false if self.current_preload.empty?

  # take the first buffered row as the current one
  self.current = self.current_preload.shift
  self.line_count += 1
  true
end
|
391
|
+
|
392
|
+
# Reads the next chunk of lines from the csv file and parses it; the chunk
# size is the `preload` attribute (default 1000, configurable in the
# constructor). The work is delegated to `_preload`.
#
# Returns false when the cursor has already reached the end of the file,
# otherwise true.
#
# NOTE: an earlier variant that alternated with `_background_preload` through
# `background_task` / `next_preload` is disabled; only the synchronous path is
# active.
#
def perform_preload
  self._preload
end
|
416
|
+
|
417
|
+
##
# Experimental; not wired into `perform_preload`.
#
# Starts a thread that reads up to `preload` lines from the file, parses them
# with `parse_options` and stores the rows in `next_preload`. `file_eof` is set
# to true when the end of the file is hit while reading.
#
# The thread is stored in `background_task` and returned; join it before
# reading `next_preload`. A parse error is raised inside the thread and
# surfaces when the thread is joined.
def _background_preload
  self.background_task = Thread.new do
    # raw text of the chunk before it is parsed to arrays
    buffer = String.new
    self.preload.times do
      begin
        buffer << self.file_object.readline
      rescue EOFError
        self.file_eof = true
        break
      end
    end

    # options are passed as keywords: CSV.parse no longer accepts a
    # positional options Hash on Ruby 3
    self.next_preload = CSV.parse(buffer, **self.parse_options)
  end
end
|
454
|
+
|
455
|
+
# Reads up to `preload` lines from the file, parses them in one go with
# `parse_options` and stores the resulting rows in `current_preload`.
#
# Returns false (leaving `current_preload` untouched) when the file is already
# at its end, otherwise true.
def _preload
  return false if self.file_object.eof?

  # raw text of the chunk before it is parsed to arrays
  chunk = String.new
  self.preload.times do
    break if self.file_object.eof?

    chunk << self.file_object.readline
  end

  # options are passed as keywords: CSV.parse no longer accepts a positional
  # options Hash on Ruby 3
  self.current_preload = CSV.parse(chunk, **self.parse_options)
  true
end
|
474
|
+
|
475
|
+
# Set filter. You can use this operator ['>', '<', '!', '%'] and Range to
|
476
|
+
# perform filter.
|
477
|
+
#
|
478
|
+
# Before set filter, you must set header true or define header by yourself.
|
479
|
+
# For set header true and define header, see example above.
|
480
|
+
#
|
481
|
+
# Example:
|
482
|
+
# CSV data:
|
483
|
+
# ___________________________
|
484
|
+
# |id | name | birth |
|
485
|
+
# |1 | shania | 27-06-1998|
|
486
|
+
# |2 | jessica | 19-08-1993|
|
487
|
+
# |3 | michelle| 28-10-1999|
|
488
|
+
# |___|__________|___________|
|
489
|
+
#
|
490
|
+
# - Equal.
|
491
|
+
# * csv = ImportCSV.new('member.csv'), header: true)
|
492
|
+
# csv.where(name: 'michelle')
|
493
|
+
# ... use csv.each or while csv.next ...
|
494
|
+
# => will return [3, 'michelle', '28-10-1999']
|
495
|
+
#
|
496
|
+
# * csv = ImportCSV.new('member.csv'), header: true)
|
497
|
+
# csv.where(name: ['shania', 'jessica'])
|
498
|
+
# ... use csv.each or while csv.next ...
|
499
|
+
# => will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]
|
500
|
+
#
|
501
|
+
# - Range. Only for Date, Time, Integer and Float values. Pass a real Ruby
|
502
|
+
# Range object; a Range with String endpoints raises ArgumentError, so no
|
503
|
+
# data type prefix is used here. See examples below.
|
504
|
+
# * csv = ImportCSV.new('member.csv', header: true)
|
505
|
+
# csv.where(id: 1..2)
|
506
|
+
# ... use csv.each or while csv.next ...
|
507
|
+
# => will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]
|
508
|
+
#
|
509
|
+
# * csv = ImportCSV.new('member.csv', header: true)
|
510
|
+
# csv.where(birth: Date.new(1993, 1, 1)..Date.new(1999, 1, 1))
|
511
|
+
# ... use csv.each or while csv.next ...
|
512
|
+
# => will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]
|
513
|
+
#
|
514
|
+
# - Operator '>' & '<'. Only for column with data type Integer, Float or Date
|
515
|
+
# Like `id` or `birth` in example csv data above.
|
516
|
+
# - Data type must defined in filter, use `integer` for Integer or Float
|
517
|
+
# and use `date` for Date. Put operator & data type together without
|
518
|
+
# space. See example below.
|
519
|
+
# - For filter with Date data type (in csv or in filter), any value that
|
520
|
+
# can be parse using `Date.parse` are acceptable.
|
521
|
+
# * csv = ImportCSV.new('member.csv'), header: true)
|
522
|
+
# csv.where(id: '>(integer)1')
|
523
|
+
# ... use csv.each or while csv.next ...
|
524
|
+
# => will return [[2, 'jessica', '19-08-1993'], [3, 'michelle', '28-10-1999']]
|
525
|
+
#
|
526
|
+
# * csv = ImportCSV.new('member.csv'), header: true)
|
527
|
+
# csv.where(birth: '<(date)01-01-1997')
|
528
|
+
# ... use csv.each or while csv.next ...
|
529
|
+
# => will return [2, 'jessica', '19-08-1993']
|
530
|
+
#
|
531
|
+
# - Operator '!'. Put this operator in first character
|
532
|
+
# and folow with query without space.
|
533
|
+
# * csv = ImportCSV.new('member.csv'), header: true)
|
534
|
+
# csv.where(name: '!michelle')
|
535
|
+
# ... use csv.each or while csv.next ...
|
536
|
+
# => will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]
|
537
|
+
#
|
538
|
+
# * csv = ImportCSV.new('member.csv'), header: true)
|
539
|
+
# csv.where(name: ['!shania', '!jessica'])
|
540
|
+
# ... use csv.each or while csv.next ...
|
541
|
+
# => will return [3, 'michelle', '28-10-1999']
|
542
|
+
#
|
543
|
+
# - Operator '%'. `Like` Operator. Put this operator in first character and
|
544
|
+
# folow with query without space.
|
545
|
+
# * csv = ImportCSV.new('member.csv'), header: true)
|
546
|
+
# csv.where(name: '%jes')
|
547
|
+
# ... use csv.each or while csv.next ...
|
548
|
+
# => will return [2, 'jessica', '19-08-1993']
|
549
|
+
#
|
550
|
+
# * csv = ImportCSV.new('member.csv'), header: true)
|
551
|
+
# csv.where(name: ['%jes', '%shan'])
|
552
|
+
# ... use csv.each or while csv.next ...
|
553
|
+
# => will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]
|
554
|
+
#
|
555
|
+
# Note :
|
556
|
+
# - A data type must be given when you use `<` or `>`. Don't give a data
|
557
|
+
# type with any other operator or with a Range.
|
558
|
+
#
|
559
|
+
def where(query = Hash.new)
  # Only Range conditions are validated here; every other condition is
  # interpreted later by `perform_filter`. A header is not required at this
  # point (that check is intentionally disabled).
  query.each do |_column, condition|
    next unless condition.kind_of?(Range)

    if condition.first.kind_of?(String) || condition.last.kind_of?(String)
      raise ArgumentError, "Range filter only accept Date, Time, Integer or Float data type."
    end

    raise ArgumentError, "First value is larger than last value." if condition.first > condition.last
  end

  # conditions accumulate across calls; a repeated column overrides the old one
  self.query = self.query.merge(query)
  # return self so calls can be chained
  self
end
|
580
|
+
|
581
|
+
# Drops every condition set through `where` and returns self for chaining.
def clear_filter
  tap { |reader| reader.query = Hash.new }
end
|
585
|
+
|
586
|
+
# Performs a preload that keeps only the lines matching +query+. Set the
# query with `where` before calling this method.
#
# Lines are read in batches of `preload` until at least `preload` lines have
# been accepted or the end of the file is reached. The accepted lines are then
# parsed together with `parse_options` and stored in `current_preload`.
#
# A line is accepted when EVERY column condition of the query matches. An
# empty query accepts nothing. For one column, a list of values is evaluated
# in order: `>`, `<`, `%` and plain equality accept on the first hit, while
# `!` rejects on the first hit.
#
# NOTE: to test a condition the raw line is split on :col_sep (default ',')
# without CSV quoting rules, so a quoted field that contains the separator
# shifts the columns seen by the filter.
#
# Returns the Array of parsed rows, or false when no row was accepted (which
# only happens once the end of the file has been reached).
# Raises ArgumentError for an unknown column, an unsupported Range type or a
# `>` / `<` condition without a `(date)` / `(integer)` type tag.
#
def perform_filter(query = self.query)
  col_sep = self.parse_options[:col_sep] || ','
  row_sep = self.parse_options[:row_sep] || "\n"
  # raw text of the accepted lines
  accepted = []

  # keep reading batches until enough lines were accepted or the file ends
  loop do
    self.preload.times do
      break if self.file_object.eof?

      line = self.file_object.readline
      # -1 keeps trailing empty columns
      cells = line.split(col_sep, -1)
      accepted << line if _filter_row_matches?(cells, query, row_sep)
    end
    break if accepted.size >= self.preload || self.file_object.eof?
  end

  # Each accepted line still carries its own line ending, so plain
  # concatenation rebuilds valid CSV text for any column separator.
  self.current_preload = CSV.parse(accepted.join, **self.parse_options)
  self.current_preload.empty? ? false : self.current_preload
end

# True when +cells+ satisfies every condition in +query+.
def _filter_row_matches?(cells, query, row_sep)
  return false if query.empty?

  query.all? do |key, values|
    cell = _filter_cell(cells, key, row_sep)
    if values.kind_of?(Range)
      _filter_range_match?(cell, values)
    else
      _filter_values_match?(cell, values)
    end
  end
end

# Returns the text of column +key+ with surrounding quotes and the row
# separator removed.
def _filter_cell(cells, key, row_sep)
  index = get_header_index(key)
  raise ArgumentError, "Unknown header: #{key}" if index.nil?

  cells[index].to_s.gsub(/\A"|"\Z/, '').gsub(row_sep, '')
end

# True when +cell+ lies within +range+ (both ends inclusive). Date/Time ranges
# compare against `Date.parse(cell)`, numeric ranges against `cell.to_f`.
def _filter_range_match?(cell, range)
  first = range.first
  if first.kind_of?(Date) || first.kind_of?(Time)
    lower = first
    upper = range.last
    actual = Date.parse(cell)
  elsif first.kind_of?(Integer) || first.kind_of?(Float)
    lower = first.to_f
    upper = range.last.to_f
    actual = cell.to_f
  else
    raise ArgumentError, "Range filter only accept Date, Time, Integer or Float data type."
  end

  actual >= lower && actual <= upper
end

# Evaluates one value or a list of values against +cell+, in order.
def _filter_values_match?(cell, values)
  values = [values] unless values.kind_of?(Array)
  matched = false

  values.each do |value|
    operator = value.kind_of?(String) ? value[0] : nil
    case operator
    when '>', '<'
      matched = _filter_compare(cell, value)
      break if matched
    when '!'
      matched = cell != value[1..-1]
      # one excluded value is enough to reject the line
      break unless matched
    when '%'
      matched = cell.include?(value[1..-1])
      break if matched
    else
      matched = (cell === value)
      break if matched
    end
  end

  matched
end

# Evaluates a `>(type)value` / `<(type)value` condition, where type is `date`
# or `integer` (the latter compares as Float).
def _filter_compare(cell, value)
  parts = /\A([<>])\((date|integer)\)(.*)\z/mi.match(value)
  if parts.nil?
    raise ArgumentError, "Data type must be defined as (integer) or (date) right after '>' or '<'."
  end

  if parts[2].downcase == 'date'
    threshold = Date.parse(parts[3])
    actual = Date.parse(cell)
  else
    threshold = parts[3].to_f
    actual = cell.to_f
  end

  parts[1] == '>' ? actual > threshold : actual < threshold
end
|
722
|
+
|
723
|
+
# Call this method to reuse the object: the file is opened again from the
# start so it can be read once more.
#
# `line_count`, `current` and `current_preload` are reset. When the header was
# taken from the file (not defined by yourself) the header line is skipped
# again. The previously opened handle is closed so repeated calls do not
# accumulate open file descriptors.
#
# Returns true. Errors from opening or reading the file propagate unchanged.
def reopen
  previous = self.file_object
  # open first, so a failure leaves the old handle in place
  self.file_object = File.open(self.file_path, 'r')
  previous.close if previous && !previous.closed?

  self.line_count = 1
  self.current = []
  self.current_preload = []
  self.file_object.readline if self.has_header && !self.define_header_by_your_self
  true
end
|
737
|
+
|
738
|
+
# Creates (or empties) +file+ and yields it, opened in append mode, to the
# given block. Both handles are closed automatically.
#
# file - path of the file to write.
#
# Returns the value of the block.
# Raises NotImplementedError when no block is given.
def self.export(file)
  raise NotImplementedError, "block must given." unless block_given?

  # truncate or create the target; File.write closes its handle itself
  File.write(file, '')
  File.open(file, 'a') { |io| yield io }
end
|
748
|
+
end
|
metadata
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: import_csv
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Kasyfil Aziz Tri Cahyana
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-07-12 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Faster & most memory efficient to import large csv file in Ruby.
|
14
|
+
email: tricahyana@windowslive.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- lib/import_csv.rb
|
20
|
+
homepage: http://rubygems.org/gems/import_csv
|
21
|
+
licenses:
|
22
|
+
- MIT
|
23
|
+
metadata: {}
|
24
|
+
post_install_message:
|
25
|
+
rdoc_options: []
|
26
|
+
require_paths:
|
27
|
+
- lib
|
28
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">="
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
requirements: []
|
39
|
+
rubyforge_project:
|
40
|
+
rubygems_version: 2.5.1
|
41
|
+
signing_key:
|
42
|
+
specification_version: 4
|
43
|
+
summary: Import CSV
|
44
|
+
test_files: []
|