import_csv 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/import_csv.rb +748 -0
  3. metadata +44 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 4b331c5ae2a5fd009f9679efbb6d5b69c52c3555
4
+ data.tar.gz: 2cf6adef2752499a5e013db244319e31cb97a655
5
+ SHA512:
6
+ metadata.gz: fcd5b98e5261cded35cab1d87fd6b487c5b8bcd93bc693c7a1840ab6a225793cf7a93be1a04b5b0fada631d5de275380c239507bafc8623721ca1a63b3269692
7
+ data.tar.gz: b7b13d3b3ddc5c8c2868ecf92268da30c3ef5bb3dd6867f7fb60a9505511a8922d20d9ec9dbddcfa78edb1e3314bbc3b009223747cdcdff4f197dd3bc79a75f5
data/lib/import_csv.rb ADDED
@@ -0,0 +1,748 @@
1
+ # Kasyfil Aziz Tri Cahyana <tricahyana@windowslive.com> <kasyfil.aziz@wgs.co.id> 2016
2
+ #
3
+ # require all file below in yours Ruby on Rails Application
4
+ # lib/import_csv/import.rb
5
+ #
6
+ # This library can make import data from large csv (>10M) faster and very low
7
+ # memory usage (depend on preload data setting).
8
+ #
9
+ # This library will get data line by line and parse to array, it's using CSV.parse
10
+ # but more efficient. Rather than parse line by line, this library will load some
11
+ # line to memory first and parse to array. It called preload data. Default
12
+ # preload data is 1000, but you can define preload data by your own. You can also
13
+ # configure parse options, like the CSV parse options in the standard Ruby library.
14
+ #
15
+ # Example :
16
+ # csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
17
+ # while csv.next
18
+ # p csv.current[0]
19
+ # p csv.current[1]
20
+ # ... your code ...
21
+ # end
22
+ #
23
+ # csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
24
+ # csv.each do |line|
25
+ # p line[0]
26
+ # p line[1]
27
+ # ... your code ...
28
+ # end
29
+ #
30
+ # ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv')) do |line|
31
+ # p line[0]
32
+ # p line[1]
33
+ # ... your code ...
34
+ # end
35
+ #
36
+ # csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true) do |line|
37
+ # p line['location_id']
38
+ # p line['location_name']
39
+ # ... your code ...
40
+ # end
41
+ #
42
+ # csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true)
43
+ # while csv.next
44
+ # p csv.location_id
45
+ # p csv.location_name
46
+ # ... your code ...
47
+ # end
48
+ #
49
+ # - Setting preload data. You can setting preload data by set preload attribute
50
+ # or in ImportCSV.new
51
+ # csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
52
+ # csv.preload = 2000
53
+ # csv.each do |line|
54
+ # p line[0]
55
+ # p line[1]
56
+ # ... your code ...
57
+ # end
58
+ #
59
+ # csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), preload: 2000)
60
+ # csv.each do |line|
61
+ # p line[0]
62
+ # p line[1]
63
+ # ... your code ...
64
+ # end
65
+ #
66
+ # - Setting Automatic header, get header from first row in file and ignore
67
+ # first row.
68
+ # csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true, preload: 2000)
69
+ # csv.each do
70
+ # p csv.location_id
71
+ # p csv.location_name
72
+ # ... your code ...
73
+ # end
74
+ #
75
+ # - Define header by your self. If you define header your self, this script
76
+ # will not ignore first row in file.
77
+ # csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: ['location_id', 'location_name'])
78
+ # csv.each do
79
+ # p csv.location_id
80
+ # p csv.location_name
81
+ # ... your code ...
82
+ # end
83
+ #
84
+ # - Define header by instance method.
85
+ # csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
86
+ # csv.set_header ['location_id', 'location_name']
87
+ # csv.each do
88
+ # p csv.location_id
89
+ # p csv.location_name
90
+ # ... your code ...
91
+ # end
92
+ #
93
+ # - Call data using hash. Header must be set.
94
+ # csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
95
+ # csv.set_header ['location_id', 'location_name']
96
+ # csv.each do
97
+ # p csv['location_id']
98
+ # p csv['location_name']
99
+ # ... your code ...
100
+ # end
101
+ #
102
+ # - Setting parse option. (Read: http://ruby-doc.org/stdlib-2.0.0/libdoc/csv/rdoc/CSV.html#class-CSV-label-CSV+and+Character+Encodings+-28M17n+or+Multilingualization-29)
103
+ # csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true, parse_options: {col_sep: ';', quote_char: '"'})
104
+ # csv.each do |line|
105
+ # p line['location_id']
106
+ # p line['location_name']
107
+ # ... your code ...
108
+ # end
109
+ #
110
+ # - Parallel processing. Instead of looping through the preload data row by row, `each_preload` passes
111
+ # the whole array of preload data to the block, so you can send it to a background process like Sidekiq.
112
+ # csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true, return_preload_only: true)
113
+ # csv.each_preload do |preload_data|
114
+ # CsvWorker.perform_async(preload_data)
115
+ # end
116
+ #
117
+ # - Filter data. For more examples, see the documentation of the `where` method below.
118
+ # csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true)
119
+ # csv.where(location_name: 'jakarta')
120
+ # csv.each do |line|
121
+ # p line.location_id
122
+ # p line.location_name
123
+ # ... your code ...
124
+ # end
125
+ #
126
+
127
+ require 'csv'
128
+
129
+ class ImportCSV
130
+ # preload data
131
+ attr_accessor :preload
132
+
133
+ # current line number in file
134
+ attr_accessor :line_count
135
+
136
+ # file path (string)
137
+ attr_accessor :file_path
138
+
139
+ # file object (File)
140
+ attr_accessor :file_object
141
+
142
+ # header (Array)
143
+ attr_accessor :header
144
+
145
+ # header (Boolean)
146
+ attr_accessor :has_header
147
+ attr_accessor :define_header_by_your_self
148
+
149
+ # current line in csv file if header has been define
150
+ attr_accessor :current
151
+
152
+ # current preload data
153
+ attr_accessor :current_preload
154
+
155
+ # Boolean. If true, will loop through file and send current preload data to
156
+ # block function
157
+ attr_accessor :return_preload_only
158
+
159
+ # set parse options
160
+ attr_accessor :parse_options
161
+
162
+ attr_accessor :query
163
+
164
+ attr_accessor :background_task
165
+
166
+ attr_accessor :next_preload
167
+
168
+ attr_accessor :file_eof
169
+
170
# Class constructor.
# Opens the CSV file at +file_path+ for reading and stores the settings.
#
# Recognised keys in +options+:
#   :preload             - number of lines read per batch (default 1000)
#   :parse_options       - options kept for the CSV.parse calls (default {})
#   :return_preload_only - stored as given (default false)
#   :header              - an Array of column names to use as given, or any
#                          other truthy value to let `header_generator` read
#                          the names from the first row of the file
#
# If a block is given, `each` is called right away and the block receives
# the two values `each` yields.
#
# Example :
# ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv')) do |line|
#   p line[0]
#   p line[1]
#   ... your code ...
# end
#
def initialize(file_path, options = {})
  self.preload = options[:preload] || 1000
  self.line_count = 1
  self.file_path = file_path
  self.parse_options = options[:parse_options] || {}
  self.return_preload_only = options[:return_preload_only] || false
  # The handle stays open after construction; `reopen` replaces it.
  self.file_object = File.open(self.file_path, 'r')
  self.background_task = nil
  self.next_preload = []
  self.file_eof = false

  header_option = options[:header]
  if header_option
    self.has_header = true
    if header_option.kind_of?(Array)
      self.header = header_option
      # The first row of the file is data in this case. `reopen` consults
      # this flag to decide whether it must skip a header row.
      self.define_header_by_your_self = true
    end
    # create attribute readers for the header columns
    self.header_generator
  end

  self.current = []
  self.current_preload = []
  self.query = {}

  self.each { |line, line_count| yield line, line_count } if block_given?
end
208
+
209
# Base name (last path component) of the file currently held in
# `file_object`.
def file_name
  handle = self.file_object
  File.basename(handle)
end
213
+
214
# Yields one whole batch of preload data per iteration instead of single
# lines. The batch size depends on the `preload` attribute.
#
# Useful for parallel processing, e.g. handing each batch to a worker.
#
# return_hash - when truthy every row of the batch is converted to a Hash
#               through `create_hash`; otherwise the parsed arrays in
#               `current_preload` are yielded. Defaults to `has_header`.
#
# Batches come from `perform_preload`, or from `perform_filter` when a query
# has been set. Returns an Enumerator when no block is given.
#
def each_preload(return_hash = self.has_header)
  return enum_for(:each_preload, return_hash) unless block_given?

  load_batch =
    if self.query.empty?
      -> { self.perform_preload }
    else
      # End of file is checked here rather than through the return value of
      # `perform_filter`, so the loop also stops when that value is truthy
      # at end of file.
      -> { self.file_object.eof? ? false : (self.perform_filter; true) }
    end

  while load_batch.call
    yield(return_hash ? create_hash : self.current_preload)
  end
end
242
+
243
# Converts every row of `current_preload` into a Hash keyed by the entries of
# `header` (paired by position) and returns the Hashes as an Array.
def create_hash
  self.current_preload.map { |row| Hash[self.header.zip(row)] }
end
250
+
251
# Sets the header so that columns can be read by name.
#
# Example:
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
# csv.set_header ['location_id', 'location_date', 'departure_date']
# csv.each do
#   p csv.location_id
#   p csv.location_date
#   ... your code ...
# end
#
# header - Array of column names. Each name is converted with `to_s` and
#          downcased, so Symbols are accepted as well as Strings.
#
# Marks the header as defined by the caller and returns the value of
# `header_generator`. Raises ArgumentError when +header+ is not an Array.
#
def set_header(header)
  raise ArgumentError, "header must be an array" unless header.kind_of?(Array)

  self.header = header.map { |name| name.to_s.downcase }
  self.has_header = true
  self.define_header_by_your_self = true
  self.header_generator
end
274
+
275
# Resolves a column reference to its position in `header`.
#
# header - an Integer (returned unchanged) or a column name (String, Symbol
#          or anything answering `to_s`).
#
# Names are compared case-insensitively on both sides, so a header stored in
# mixed case still matches. Returns nil when no column has that name.
def get_header_index(header)
  return header if header.kind_of?(Integer)

  wanted = header.to_s.downcase
  self.header.index { |name| name.to_s.downcase == wanted }
end
279
+
280
+ alias :define_header :set_header
281
+
282
# Creates the readers that expose the current row by column name.
# You don't have to call this method in your code.
#
# Returns false when `has_header` is not set. Otherwise:
# * if `header` is not an Array yet, the next line of the file is read,
#   parsed with `parse_options` and stored (downcased) as the header;
# * for every column a singleton method is defined that returns the matching
#   cell of `current`. The method name is the downcased column name with
#   every character other than A-Z/a-z replaced by `_`;
# * a singleton `[]` is defined for lookup by column name. It is
#   case-insensitive, accepts Strings and Symbols, and returns nil for an
#   unknown column.
#
# Returns the header Array.
def header_generator
  return false unless self.has_header

  unless self.header.kind_of?(Array)
    first_row = CSV.parse(self.file_object.readline, **(self.parse_options || {})).first
    self.header = first_row.map { |name| name.to_s.downcase }
  end
  columns = self.header

  columns.each_with_index do |name, position|
    reader = name.to_s.downcase.gsub(/[^A-Za-z]/, '_')
    next if reader.empty?

    # A column called e.g. "current" or "header" must not replace a method
    # the importer class defines itself; such columns remain reachable
    # through `[]`. Methods inherited from Object/Kernel may still be
    # replaced.
    if self.class.method_defined?(reader)
      owner = self.class.instance_method(reader).owner
      next unless [Object, Kernel, BasicObject].include?(owner)
    end

    self.define_singleton_method(reader) { self.current[position] }
  end

  self.define_singleton_method(:[]) do |key|
    wanted = key.to_s.downcase
    position = columns.index { |name| name.to_s.downcase == wanted }
    position && self.current[position]
  end

  self.header
end
306
+
307
# Loops through the csv file, advancing with `next`.
#
# Without a header the block receives the current row (`current`) and
# `line_count`:
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'))
# csv.each do |line, line_count|
#   p line[0]
#   p line[1]
#   ... your code ...
# end
#
# With a header the block receives the importer itself and `line_count`:
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true)
# csv.each do |line, line_count|
#   csv.location_id
#   csv.location_name
#   ... your code ...
# end
#
# Returns an Enumerator when no block is given.
def each
  return enum_for(:each) unless block_given?

  # Read once, before iterating, which of the two shapes to yield.
  yield_self = self.has_header
  while self.next
    yield(yield_self ? self : self.current, self.line_count)
  end
end
336
+
337
# Moves to the next line of the CSV. Rows are served from `current_preload`;
# when that buffer is empty it is refilled through `perform_preload`, or
# through `perform_filter` when a query has been set.
#
# After a successful call the row is available in `current` (and through the
# readers named after the header, if a header was defined).
#
# Returns true when a row was loaded, false otherwise, which makes the method
# usable as a `while` condition:
# csv = ImportCSV.new(Rails.root.join('db/seeds/development/tx_locations.csv'), header: true)
# while csv.next
#   p csv.location_id
# end
#
def next
  if self.current_preload.empty?
    # Nothing buffered and nothing left to read: clear the row and stop.
    if self.file_object.eof?
      self.current = []
      return false
    end

    # Unfiltered batches come from `perform_preload`; once `where` has stored
    # a query the batch is built by `perform_filter` instead.
    self.query.empty? ? self.perform_preload : self.perform_filter
  end

  # The refill may have produced no rows at all.
  return false if self.current_preload.empty?

  # Take the first buffered row and remove it from the buffer.
  self.current = self.current_preload.shift
  self.line_count += 1
  true
end
391
+
392
# Reads the next batch of lines from the csv file and parses it; the batch
# size is the `preload` attribute (1000 unless changed, see `initialize`).
#
# The work is delegated to `_preload`, whose result is returned unchanged:
# false once the file has reached its end, true otherwise.
#
# An earlier experiment that prefetched the following batch on a background
# thread (`_background_preload`) is disabled; only the synchronous path runs.
#
def perform_preload
  loaded = _preload
  loaded
end
416
+
417
##
# Experimental
# Currently not working; the only references to it in this file are in
# commented-out code.
#
# Reads the next batch on a background thread: up to `preload` lines are
# read from `file_object`, parsed with `parse_options` and stored in
# `next_preload`. `file_eof` is set when the end of the file is hit while
# reading. The thread is stored in `background_task` and returned.
#
# An error raised while reading or parsing ends the thread with that error;
# it surfaces when the thread is joined.
def _background_preload
  self.background_task = Thread.new do
    # raw text of the batch before it is parsed
    chunk = String.new
    self.preload.times do
      begin
        chunk << self.file_object.readline
      rescue EOFError
        self.file_eof = true
        break
      end
    end

    self.next_preload = CSV.parse(chunk, **self.parse_options)
  end
end
454
+
455
# Reads up to `preload` lines from `file_object`, parses them in a single
# CSV.parse call (using `parse_options`) and stores the rows in
# `current_preload`.
#
# Returns false without touching `current_preload` when the file is already
# at its end, true otherwise.
#
# NOTE(review): batches are cut on physical lines, so a quoted field that
# contains a line break may be split across two batches. Confirm whether
# such input needs to be supported.
def _preload
  return false if self.file_object.eof?

  # raw text of the batch before it is parsed
  chunk = String.new
  self.preload.times do
    break if self.file_object.eof?
    chunk << self.file_object.readline
  end

  # The options are passed as keywords: CSV.parse takes keyword options, and
  # a positional Hash is rejected on Ruby 3.
  self.current_preload = CSV.parse(chunk, **self.parse_options)
  true
end
474
+
475
+ # Set filter. You can use this operator ['>', '<', '!', '%'] and Range to
476
+ # perform filter.
477
+ #
478
+ # Before set filter, you must set header true or define header by yourself.
479
+ # For set header true and define header, see example above.
480
+ #
481
+ # Example:
482
+ # CSV data:
483
+ # ___________________________
484
+ # |id | name | birth |
485
+ # |1 | shania | 27-06-1998|
486
+ # |2 | jessica | 19-08-1993|
487
+ # |3 | michelle| 28-10-1999|
488
+ # |___|__________|___________|
489
+ #
490
+ # - Equal.
491
+ # * csv = ImportCSV.new('member.csv'), header: true)
492
+ # csv.where(name: 'michelle')
493
+ # ... use csv.each or while csv.next ...
494
+ # => will return [3, 'michelle', '28-10-1999']
495
+ #
496
+ # * csv = ImportCSV.new('member.csv'), header: true)
497
+ # csv.where(name: ['shania', 'jessica'])
498
+ # ... use csv.each or while csv.next ...
499
+ # => will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]
500
+ #
501
+ # - Range. Only for Date, Integer and Float bounds. Pass the bounds as
502
+ # real objects of those types; a Range with String bounds raises an ArgumentError.
503
+ # See the examples below.
504
+ # * csv = ImportCSV.new('member.csv'), header: true)
505
+ # csv.where(id: 1..2)
506
+ # ... use csv.each or while csv.next ...
507
+ # => will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]
508
+ #
509
+ # * csv = ImportCSV.new('member.csv'), header: true)
510
+ # csv.where(birth: Date.new(1993, 1, 1)..Date.new(1999, 1, 1))
510
+ # ... use csv.each or while csv.next ...
511
+ # => will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]
513
+ #
514
+ # - Operator '>' & '<'. Only for column with data type Integer, Float or Date
515
+ # Like `id` or `birth` in example csv data above.
516
+ # - Data type must defined in filter, use `integer` for Integer or Float
517
+ # and use `date` for Date. Put operator & data type together without
518
+ # space. See example below.
519
+ # - For filter with Date data type (in csv or in filter), any value that
520
+ # can be parse using `Date.parse` are acceptable.
521
+ # * csv = ImportCSV.new('member.csv'), header: true)
522
+ # csv.where(id: '>(integer)1')
523
+ # ... use csv.each or while csv.next ...
524
+ # => will return [[2, 'jessica', '19-08-1993'], [3, 'michelle', '28-10-1999']]
525
+ #
526
+ # * csv = ImportCSV.new('member.csv'), header: true)
527
+ # csv.where(birth: '<(date)01-01-1997')
528
+ # ... use csv.each or while csv.next ...
529
+ # => will return [2, 'jessica', '19-08-1993']
530
+ #
531
+ # - Operator '!'. Put this operator in first character
532
+ # and folow with query without space.
533
+ # * csv = ImportCSV.new('member.csv'), header: true)
534
+ # csv.where(name: '!michelle')
535
+ # ... use csv.each or while csv.next ...
536
+ # => will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]
537
+ #
538
+ # * csv = ImportCSV.new('member.csv'), header: true)
539
+ # csv.where(name: ['!shania', '!jessica'])
540
+ # ... use csv.each or while csv.next ...
541
+ # => will return [3, 'michelle', '28-10-1999']
542
+ #
543
+ # - Operator '%'. `Like` Operator. Put this operator in first character and
544
+ # folow with query without space.
545
+ # * csv = ImportCSV.new('member.csv'), header: true)
546
+ # csv.where(name: '%jes')
547
+ # ... use csv.each or while csv.next ...
548
+ # => will return [2, 'jessica', '19-08-1993']
549
+ #
550
+ # * csv = ImportCSV.new('member.csv'), header: true)
551
+ # csv.where(name: ['%jes', '%shan'])
552
+ # ... use csv.each or while csv.next ...
553
+ # => will return [[1, 'shania', '27-06-1998'], [2, 'jessica', '19-08-1993']]
554
+ #
555
+ # Note :
556
+ # - The data type must be defined when you use `<` or `>`. Don't define a
557
+ # data type with any other operator or in a Range.
558
+ #
559
def where(query = {})
  # Validate Range conditions up front so that a bad filter fails here and
  # not in the middle of reading the file.
  query.each do |_key, values|
    next unless values.kind_of?(Range)

    # `begin`/`end` are used instead of `first`/`last` so that an open-ended
    # range is rejected by the type check instead of raising RangeError.
    bounds = [values.begin, values.end]
    supported = bounds.all? do |bound|
      [Date, Time, Integer, Float].any? { |type| bound.kind_of?(type) }
    end
    unless supported
      raise ArgumentError, "Range filter only accept Date, Time, Integer or Float data type."
    end

    if values.begin > values.end
      raise ArgumentError, "First value is larger than last value."
    end
  end

  # New conditions are merged into the existing ones; a repeated key
  # replaces the earlier condition for that column.
  self.query = self.query.merge(query)
  # for chaining
  self
end
580
+
581
# Drops every condition stored by `where` by replacing `query` with an empty
# Hash. Returns the importer itself so calls can be chained.
def clear_filter
  tap { |importer| importer.query = {} }
end
585
+
586
# Performs a preload that keeps only the lines matching +query+ (by default
# the conditions stored by `where`) and stores the parsed rows in
# `current_preload`.
#
# Lines are read in runs of `preload` lines until at least `preload` lines
# were accepted or the file ends. Matching works on the raw line split by
# the column separator; surrounding double quotes and the row separator are
# removed from a cell before it is compared.
#
# A line is accepted when every key of the query matches (an empty query
# accepts nothing). Per key the condition is one of:
# * a Range of Date/Time or Integer/Float: the cell must lie within the
#   bounds, both ends included;
# * a value or an Array of values, each optionally starting with an
#   operator: '>' / '<' followed by "(integer)" or "(date)" and the limit,
#   '!' for "not equal", '%' for "contains"; no operator means "equal".
#
# Returns false (and empties `current_preload`) when the file is already at
# its end; otherwise returns the rows stored in `current_preload`.
#
# Raises ArgumentError for an unknown column, an unsupported Range type or a
# '>' / '<' filter without a valid data type.
#
def perform_filter(query = self.query)
  if self.file_object.eof?
    self.current_preload = []
    return false
  end

  col_sep = self.parse_options[:col_sep] || ','
  row_sep = self.parse_options[:row_sep] || "\n"
  # raw text of the accepted lines
  accepted = []
  loop do
    self.preload.times do
      break if self.file_object.eof?

      line = self.file_object.readline
      # The untouched line is kept, so quoting and trailing empty columns
      # reach the CSV parser exactly as they are in the file.
      accepted << line if _filter_row_match?(line.split(col_sep), query, row_sep)
    end
    break if accepted.size >= self.preload || self.file_object.eof?
  end

  # Each accepted line still carries its own line ending, so the lines are
  # concatenated without any separator.
  self.current_preload = CSV.parse(accepted.join, **self.parse_options)
end

# True when the split line +cells+ satisfies every condition in +query+.
def _filter_row_match?(cells, query, row_sep)
  return false if query.empty?

  query.all? do |key, values|
    index = self.get_header_index(key)
    raise ArgumentError, "Unknown column in filter: #{key}" if index.nil?

    # remove quote \" and `new line` from string
    cell = cells[index].to_s.gsub(/\A"|"\Z/, '').gsub(row_sep, '')
    if values.kind_of?(Range)
      _filter_range_match?(cell, values)
    else
      _filter_values_match?(cell, values.kind_of?(Array) ? values : [values])
    end
  end
end

# True when +cell+, converted to the type of the range bounds, lies within
# +range+ (both ends included).
def _filter_range_match?(cell, range)
  lower = range.first
  upper = range.last
  case lower
  when Time
    # Date and Time cannot be compared directly, so the parsed date is
    # converted with Date#to_time before comparing.
    actual = Date.parse(cell).to_time
  when Date
    actual = Date.parse(cell)
  when Integer, Float
    # value with type integer will convert to float
    lower = lower.to_f
    upper = upper.to_f
    actual = cell.to_f
  else
    raise ArgumentError, "Range filter only accept Date, Time, Integer or Float data type."
  end
  actual >= lower && actual <= upper
end

# Evaluates a list of values for one column in order. A positive value
# ('>', '<', '%' or equality) that matches ends the evaluation with true; a
# '!' value that equals the cell ends it with false; otherwise the outcome of
# the last evaluated value is returned.
def _filter_values_match?(cell, values)
  matched = false
  values.each do |value|
    text = value.to_s
    if text.start_with?('!')
      matched = cell != text[1..-1]
      break unless matched
    else
      matched = _filter_value_match?(cell, text)
      break if matched
    end
  end
  matched
end

# Evaluates one '>' / '<' / '%' / equality filter against +cell+.
def _filter_value_match?(cell, text)
  operator = text[0]
  case operator
  when '>', '<'
    # The data type sits in brackets right after the operator; the fixed
    # offsets below skip "(date)" and "(integer)" respectively.
    case text[/\(([^\)]+)\)/, 1].to_s.downcase
    when 'date'
      limit = Date.parse(text[7..-1].to_s)
      actual = Date.parse(cell)
    when 'integer'
      limit = text[10..-1].to_f
      actual = cell.to_f
    else
      raise ArgumentError, 'Data type must be defined for > and < filters. Use (integer) or (date).'
    end
    operator == '>' ? actual > limit : actual < limit
  when '%'
    cell.include?(text[1..-1])
  else
    cell == text
  end
end
722
+
723
# Call this method to read the file again with the same object.
#
# Opens `file_path` anew, closes the handle that was in use before, and
# resets `line_count`, `current` and `current_preload`. When the header came
# from the first row of the file (`has_header` set and not defined by the
# caller), that row is skipped again. Conditions stored in `query` are kept.
#
# Returns true. Errors raised while opening the file propagate to the caller.
def reopen
  fresh = File.open(self.file_path, 'r')

  # Release the previous handle only after the new one was opened, so a
  # failed open leaves the object as it was.
  stale = self.file_object
  stale.close if stale.respond_to?(:closed?) && !stale.closed?

  self.file_object = fresh
  self.line_count = 1
  self.current = []
  self.current_preload = []

  header_in_file = self.has_header && !self.define_header_by_your_self
  self.file_object.readline if header_in_file && !self.file_object.eof?
  true
end
737
+
738
# Opens +file+ for writing (an existing file is truncated), yields the open
# File to the block and closes it when the block finishes.
#
# Returns the value of the block.
# Raises NotImplementedError when no block is given.
def self.export(file)
  raise NotImplementedError, "block must given." unless block_given?

  File.open(file, 'w') { |io| yield io }
end
748
+ end
metadata ADDED
@@ -0,0 +1,44 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: import_csv
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Kasyfil Aziz Tri Cahyana
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-07-12 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Faster & most memory efficient to import large csv file in Ruby.
14
+ email: tricahyana@windowslive.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - lib/import_csv.rb
20
+ homepage: http://rubygems.org/gems/import_csv
21
+ licenses:
22
+ - MIT
23
+ metadata: {}
24
+ post_install_message:
25
+ rdoc_options: []
26
+ require_paths:
27
+ - lib
28
+ required_ruby_version: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ required_rubygems_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ requirements: []
39
+ rubyforge_project:
40
+ rubygems_version: 2.5.1
41
+ signing_key:
42
+ specification_version: 4
43
+ summary: Import CSV
44
+ test_files: []