csv-indexer 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/csv-indexer.rb +97 -67
  3. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2a91e291c094278a60e60a898b31063afab3910ec6a01b81fcdd0451f61acdda
4
- data.tar.gz: f80471808e5f317f6e1040a7c858f48d9e6f853fc5cb2263ab168b216ee17a2d
3
+ metadata.gz: 447d47db02c690a8bda0c1418a3373363c52f1e9295e962f006ddf5123f10c42
4
+ data.tar.gz: 0f0f2a0910b7e5d4128d0f132acc071091cad18166251f873593015a8f9bf1d8
5
5
  SHA512:
6
- metadata.gz: efd9de2b0a242963542466b53045b07120722119e16c52de77b9c016058b2c684e99b779cd2d54ea49c1c2c10dfcf2fb95e0314d4504f1c7f1bb9dca7455cab8
7
- data.tar.gz: '053588f0ae2c3e598c5039f8be10e9cededf4076393585c3a94235774ed11302bc9293b0b0ed58a524d8ecda581a8bbcf7226f689a53defc734ae07c01714e47'
6
+ metadata.gz: 541ed58833c51c0f89e5bc4d217fb5b28fef3613b97f12711bd710a1d9e8bce425a69b7574f229b88c1bcf2caae2ad7fbb4db76d155c54b214f543d723e18823
7
+ data.tar.gz: 5b68297cf8b08d43618dc6ef34474ca30988fae542fcdd4cdf8f729c77d416a878f265d37faaf7aa640a91f2b101e517171eb979d141163c272da1c7ea71cfcd
data/lib/csv-indexer.rb CHANGED
@@ -99,6 +99,7 @@ module BlackStack
99
99
  l = write_log ? self.logger : BlackStack::DummyLogger.new
100
100
  # output file extension
101
101
  ext = ".#{self.name}"
102
+
102
103
  # index the bites
103
104
  Dir.glob(input).each do |file|
104
105
  # get the name of the file from the full path
@@ -120,30 +121,34 @@ module BlackStack
120
121
  a = []
121
122
  # iterate lines of input_file
122
123
  input_file.each_line do |line|
123
- i += 1
124
- fields = []
125
- key = []
126
- # get the array of fields
127
- row = CSV.parse_line(line)
128
- # build the key
129
- self.keys.each do |k|
130
- colnum = self.mapping[k]
131
- # replace '"' by empty string, and '|' with ','
132
- key << row[colnum].gsub('"', '').gsub('|', ',')
124
+ begin
125
+ i += 1
126
+ fields = []
127
+ key = []
128
+ # get the array of fields
129
+ row = CSV.parse_line(line)
130
+ # build the key
131
+ self.keys.each do |k|
132
+ colnum = self.mapping[k]
133
+ # replace '"' by empty string, and '|' with ','
134
+ key << row[colnum].gsub('"', '').gsub('|', ',')
135
+ end
136
+ key = "\"#{key.join('|')}\""
137
+ # add the key as the first field of the index line
138
+ fields << key
139
+ # add the row number as the second field of the index line
140
+ fields << "\"#{i.to_s}\""
141
+ # iterate the mapping
142
+ self.mapping.each do |k, v|
143
+ # get the data from the row
144
+ # format the field values for the CSV
145
+ fields << "\"#{row[v].to_s.gsub('"', '')}\""
146
+ end
147
+ # add fields to the array
148
+ a << fields
149
+ rescue => e
150
+ # what to do with this?
133
151
  end
134
- key = "\"#{key.join('|')}\""
135
- # add the key as the first field of the index line
136
- fields << key
137
- # add the row number as the second field of the index line
138
- fields << "\"#{i.to_s}\""
139
- # iterate the mapping
140
- self.mapping.each do |k, v|
141
- # get the data from the row
142
- # format the field values for the CSV
143
- fields << "\"#{row[v].gsub('"', '')}\""
144
- end
145
- # add fields to the array
146
- a << fields
147
152
  end
148
153
  # sort the array
149
154
  a.sort!
@@ -193,9 +198,11 @@ module BlackStack
193
198
  a2 = key2 #.split('|')
194
199
  # validation: a2.size > a1.size
195
200
  raise 'The key2 must has more elements than key1.' if a2.size < a1.size
201
+ #binding.pry if a2[0].include?('anubhava-mishra-1b668124')
196
202
  # iterate the arrays
197
203
  a2.each_with_index do |k, i|
198
- match = false if k !~ /#{Regexp.escape(a1[i].to_s)}/i
204
+ #binding.pry if k.include?('anubhava-mishra-1b668124')
205
+ match = false if k !~ /^#{Regexp.escape(a1[i].to_s)}/i
199
206
  end
200
207
  return 0 if match && !exact_match
201
208
  # return the result
@@ -248,7 +255,7 @@ module BlackStack
248
255
  l.logs "Searching into #{name}... "
249
256
  # setting boundaries for the binary search
250
257
  i = 0
251
- max = `wc -c #{file}`.split(' ').first.to_i
258
+ max = `wc -c '#{file}'`.split(' ').first.to_i
252
259
  middle = ((i + max) / 2).to_i
253
260
  # totals
254
261
  # open file with random access
@@ -260,55 +267,74 @@ module BlackStack
260
267
  # get the middle of the file
261
268
  middle = ((i + max) / 2).to_i
262
269
  # break if the middle is the same as the previous iteration
270
+ #binding.pry if middle==prev
263
271
  break if middle==prev
264
272
  # remember the middle in this iteration
265
273
  prev = middle
266
274
  # opening log line
267
275
  l.logs "#{middle}... "
268
- # go to the middle of the file
269
- f.seek(middle)
270
- # read the line
271
- # the cursor is at the middle of a line
272
- # so, I have to read a second line to get a full line
273
- line = f.readline
274
- # most probably I landed in the middle of a line, so I have to get the size of the line where I landed.
275
- a = line.split('","')
276
- while a.size < 2 # this saves the situation when the cursor is inside the last field where I place the size of the line
277
- middle -= 1
276
+ # processing the line
277
+ line = ''
278
+ line_size = 0
279
+ begin
280
+ # go to the middle of the file
278
281
  f.seek(middle)
279
- line = f.readline
282
+ # read the line
283
+ # the cursor is at the middle of a line
284
+ # so, I have to read a second line to get a full line
285
+ line = f.readline.encode('UTF-8', :undef => :replace, :invalid => :replace, :replace => " ")
286
+ #binding.pry if line.include?('anubhav521@gmail.com')
287
+ # most probably I landed in the middle of a line, so I have to get the size of the line where I landed.
280
288
  a = line.split('","')
281
- end
282
- line_size = a.last.gsub('"', '').to_i
283
- middle -= line_size-line.size+1
284
- # seek and readline again, to get the line from its beginning
285
- f.seek(middle)
286
- line = f.readline
287
- # strip the line
288
- line.strip!
289
- # get the first field of the CSV line
290
- fields = CSV.parse_line(line)
291
- row_key = fields[0].split('|')
292
- # compare keys
293
- x = compare_keys(key, row_key, exact_match)
294
- # compare the first field with the search term
295
- if x == 0
296
- # found
297
- l.logf "found (#{row_key})"
298
- ret[:matches] << fields.dup
299
- total_matches += 1
300
- break
301
- else
302
- # not found
303
- if x == 1
304
- # search in the down half
305
- max = middle
306
- else #if x == -1
307
- # search in the up half
308
- i = middle + line.size+1
289
+ while a.size < 2 # this saves the situation when the cursor is inside the last field where I place the size of the line
290
+ middle -= 1
291
+ f.seek(middle)
292
+ line = f.readline
293
+ a = line.split('","')
309
294
  end
310
- l.logf "not found (#{row_key})"
311
- end
295
+ line_size = a.last.gsub('"', '').to_i
296
+ middle -= line_size-line.size+1
297
+ # seek and readline again, to get the line from its beginning
298
+ f.seek(middle)
299
+ line = f.readline.encode('UTF-8', :undef => :replace, :invalid => :replace, :replace => " ")
300
+ # BAD PRACTICE PATCH: sometimes the new value of middle (`middle -= line_size-line.size+1`) doesn't land on the start of the line.
301
+ while line[0] != '"'
302
+ middle -= 1
303
+ f.seek(middle)
304
+ line = f.readline.encode('UTF-8', :undef => :replace, :invalid => :replace, :replace => " ")
305
+ end
306
+ # strip the line
307
+ line.strip!
308
+ # get the first field of the CSV line
309
+ fields = CSV.parse_line(line)
310
+ row_key = fields[0].split('|')
311
+ # compare keys
312
+ #binding.pry if line.include?('anubhava-mishra-1b668124')
313
+ x = compare_keys(key, row_key, exact_match)
314
+ # compare the first field with the search term
315
+ if x == 0
316
+ # found
317
+ l.logf "found (#{row_key})"
318
+ ret[:matches] << fields.dup
319
+ total_matches += 1
320
+ break
321
+ else
322
+ # not found
323
+ if x == 1
324
+ # search in the down half
325
+ max = middle
326
+ else #if x == -1
327
+ # search in the up half
328
+ i = middle + line.size+1
329
+ end
330
+ l.logf "not found (#{row_key})"
331
+ end
332
+ rescue => e
333
+ l.logf "error in line `#{line}`: #{e.to_console}"
334
+ # change the max, in order not to repeat the same iteration and exit the loop at the line `break if middle==prev`
335
+ #i+=1
336
+ #max+=1
337
+ end # begin
312
338
  end
313
339
  # closing the file
314
340
  f.close
@@ -316,12 +342,16 @@ module BlackStack
316
342
  l.done
317
343
  # increment file counter
318
344
  n += 1
345
+ # tracing log
346
+ l.log "i: #{i.to_s}"
347
+ l.log "max: #{max.to_s}"
319
348
  end
320
-
349
+ # end time
321
350
  end_time = Time.now
322
351
 
323
352
  ret[:enlapsed_seconds] = end_time - start_time
324
353
  ret[:lines_matched] = total_matches
354
+ ret[:files_processed] = n
325
355
 
326
356
  l.log "Matches: #{total_matches.to_s}"
327
357
  l.log "Enlapsed seconds: #{ret[:enlapsed_seconds].to_s}"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv-indexer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Leandro Daniel Sardi