csv-indexer 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/csv-indexer.rb +97 -67
  3. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2a91e291c094278a60e60a898b31063afab3910ec6a01b81fcdd0451f61acdda
4
- data.tar.gz: f80471808e5f317f6e1040a7c858f48d9e6f853fc5cb2263ab168b216ee17a2d
3
+ metadata.gz: 447d47db02c690a8bda0c1418a3373363c52f1e9295e962f006ddf5123f10c42
4
+ data.tar.gz: 0f0f2a0910b7e5d4128d0f132acc071091cad18166251f873593015a8f9bf1d8
5
5
  SHA512:
6
- metadata.gz: efd9de2b0a242963542466b53045b07120722119e16c52de77b9c016058b2c684e99b779cd2d54ea49c1c2c10dfcf2fb95e0314d4504f1c7f1bb9dca7455cab8
7
- data.tar.gz: '053588f0ae2c3e598c5039f8be10e9cededf4076393585c3a94235774ed11302bc9293b0b0ed58a524d8ecda581a8bbcf7226f689a53defc734ae07c01714e47'
6
+ metadata.gz: 541ed58833c51c0f89e5bc4d217fb5b28fef3613b97f12711bd710a1d9e8bce425a69b7574f229b88c1bcf2caae2ad7fbb4db76d155c54b214f543d723e18823
7
+ data.tar.gz: 5b68297cf8b08d43618dc6ef34474ca30988fae542fcdd4cdf8f729c77d416a878f265d37faaf7aa640a91f2b101e517171eb979d141163c272da1c7ea71cfcd
data/lib/csv-indexer.rb CHANGED
@@ -99,6 +99,7 @@ module BlackStack
99
99
  l = write_log ? self.logger : BlackStack::DummyLogger.new
100
100
  # output file extension
101
101
  ext = ".#{self.name}"
102
+
102
103
  # index the bites
103
104
  Dir.glob(input).each do |file|
104
105
  # get the name of the file from the full path
@@ -120,30 +121,34 @@ module BlackStack
120
121
  a = []
121
122
  # iterate lines of input_file
122
123
  input_file.each_line do |line|
123
- i += 1
124
- fields = []
125
- key = []
126
- # get the array of fields
127
- row = CSV.parse_line(line)
128
- # build the key
129
- self.keys.each do |k|
130
- colnum = self.mapping[k]
131
- # replace '"' by empty string, and '|' with ','
132
- key << row[colnum].gsub('"', '').gsub('|', ',')
124
+ begin
125
+ i += 1
126
+ fields = []
127
+ key = []
128
+ # get the array of fields
129
+ row = CSV.parse_line(line)
130
+ # build the key
131
+ self.keys.each do |k|
132
+ colnum = self.mapping[k]
133
+ # replace '"' by empty string, and '|' with ','
134
+ key << row[colnum].gsub('"', '').gsub('|', ',')
135
+ end
136
+ key = "\"#{key.join('|')}\""
137
+ # add the key as the first field of the index line
138
+ fields << key
139
+ # add the row number as the second field of the index line
140
+ fields << "\"#{i.to_s}\""
141
+ # iterate the mapping
142
+ self.mapping.each do |k, v|
143
+ # get the data from the row
144
+ # format the field values for the CSV
145
+ fields << "\"#{row[v].to_s.gsub('"', '')}\""
146
+ end
147
+ # add fields to the array
148
+ a << fields
149
+ rescue => e
150
+ # what to do with this?
133
151
  end
134
- key = "\"#{key.join('|')}\""
135
- # add the key as the first field of the index line
136
- fields << key
137
- # add the row number as the second field of the index line
138
- fields << "\"#{i.to_s}\""
139
- # iterate the mapping
140
- self.mapping.each do |k, v|
141
- # get the data from the row
142
- # format the field values for the CSV
143
- fields << "\"#{row[v].gsub('"', '')}\""
144
- end
145
- # add fields to the array
146
- a << fields
147
152
  end
148
153
  # sort the array
149
154
  a.sort!
@@ -193,9 +198,11 @@ module BlackStack
193
198
  a2 = key2 #.split('|')
194
199
  # validation: a2.size > a1.size
195
200
  raise 'The key2 must has more elements than key1.' if a2.size < a1.size
201
+ #binding.pry if a2[0].include?('anubhava-mishra-1b668124')
196
202
  # iterate the arrays
197
203
  a2.each_with_index do |k, i|
198
- match = false if k !~ /#{Regexp.escape(a1[i].to_s)}/i
204
+ #binding.pry if k.include?('anubhava-mishra-1b668124')
205
+ match = false if k !~ /^#{Regexp.escape(a1[i].to_s)}/i
199
206
  end
200
207
  return 0 if match && !exact_match
201
208
  # return the result
@@ -248,7 +255,7 @@ module BlackStack
248
255
  l.logs "Searching into #{name}... "
249
256
  # setting boundaries for the binary search
250
257
  i = 0
251
- max = `wc -c #{file}`.split(' ').first.to_i
258
+ max = `wc -c '#{file}'`.split(' ').first.to_i
252
259
  middle = ((i + max) / 2).to_i
253
260
  # totals
254
261
  # open file with random access
@@ -260,55 +267,74 @@ module BlackStack
260
267
  # get the middle of the file
261
268
  middle = ((i + max) / 2).to_i
262
269
  # break if the middle is the same as the previous iteration
270
+ #binding.pry if middle==prev
263
271
  break if middle==prev
264
272
  # remember the middle in this iteration
265
273
  prev = middle
266
274
  # opening log line
267
275
  l.logs "#{middle}... "
268
- # go to the middle of the file
269
- f.seek(middle)
270
- # read the line
271
- # the cursor is at the middle of a line
272
- # so, I have to read a second line to get a full line
273
- line = f.readline
274
- # most probably I landed in the midle of a line, so I have to get the size of the line where I landed.
275
- a = line.split('","')
276
- while a.size < 2 # this saves the situation when the cursor is inside the last field where I place the size of the line
277
- middle -= 1
276
+ # processing the line
277
+ line = ''
278
+ line_size = 0
279
+ begin
280
+ # go to the middle of the file
278
281
  f.seek(middle)
279
- line = f.readline
282
+ # read the line
283
+ # the cursor is at the middle of a line
284
+ # so, I have to read a second line to get a full line
285
+ line = f.readline.encode('UTF-8', :undef => :replace, :invalid => :replace, :replace => " ")
286
+ #binding.pry if line.include?('anubhav521@gmail.com')
287
+ # most probably I landed in the middle of a line, so I have to get the size of the line where I landed.
280
288
  a = line.split('","')
281
- end
282
- line_size = a.last.gsub('"', '').to_i
283
- middle -= line_size-line.size+1
284
- # seek and readline again, to get the line from its begining
285
- f.seek(middle)
286
- line = f.readline
287
- # strip the line
288
- line.strip!
289
- # get the first field of the CSV line
290
- fields = CSV.parse_line(line)
291
- row_key = fields[0].split('|')
292
- # compare keys
293
- x = compare_keys(key, row_key, exact_match)
294
- # compare the first field with the search term
295
- if x == 0
296
- # found
297
- l.logf "found (#{row_key})"
298
- ret[:matches] << fields.dup
299
- total_matches += 1
300
- break
301
- else
302
- # not found
303
- if x == 1
304
- # search in the down half
305
- max = middle
306
- else #if x == -1
307
- # search in the up half
308
- i = middle + line.size+1
289
+ while a.size < 2 # this saves the situation when the cursor is inside the last field where I place the size of the line
290
+ middle -= 1
291
+ f.seek(middle)
292
+ line = f.readline
293
+ a = line.split('","')
309
294
  end
310
- l.logf "not found (#{row_key})"
311
- end
295
+ line_size = a.last.gsub('"', '').to_i
296
+ middle -= line_size-line.size+1
297
+ # seek and readline again, to get the line from its beginning
298
+ f.seek(middle)
299
+ line = f.readline.encode('UTF-8', :undef => :replace, :invalid => :replace, :replace => " ")
300
+ # BAD PRACTICE PATCH: sometimes the new value of middle (`middle -= line_size-line.size+1`) doesn't hit the start of the line.
301
+ while line[0] != '"'
302
+ middle -= 1
303
+ f.seek(middle)
304
+ line = f.readline.encode('UTF-8', :undef => :replace, :invalid => :replace, :replace => " ")
305
+ end
306
+ # strip the line
307
+ line.strip!
308
+ # get the first field of the CSV line
309
+ fields = CSV.parse_line(line)
310
+ row_key = fields[0].split('|')
311
+ # compare keys
312
+ #binding.pry if line.include?('anubhava-mishra-1b668124')
313
+ x = compare_keys(key, row_key, exact_match)
314
+ # compare the first field with the search term
315
+ if x == 0
316
+ # found
317
+ l.logf "found (#{row_key})"
318
+ ret[:matches] << fields.dup
319
+ total_matches += 1
320
+ break
321
+ else
322
+ # not found
323
+ if x == 1
324
+ # search in the down half
325
+ max = middle
326
+ else #if x == -1
327
+ # search in the up half
328
+ i = middle + line.size+1
329
+ end
330
+ l.logf "not found (#{row_key})"
331
+ end
332
+ rescue => e
333
+ l.logf "error in line `#{line}`: #{e.to_console}"
334
+ # change the max, in order to don't repeat the same iteration and exit the loop in the line `break if middle==prev`
335
+ #i+=1
336
+ #max+=1
337
+ end # begin
312
338
  end
313
339
  # closing the file
314
340
  f.close
@@ -316,12 +342,16 @@ module BlackStack
316
342
  l.done
317
343
  # increment file counter
318
344
  n += 1
345
+ # tracing log
346
+ l.log "i: #{i.to_s}"
347
+ l.log "max: #{max.to_s}"
319
348
  end
320
-
349
+ # end time
321
350
  end_time = Time.now
322
351
 
323
352
  ret[:enlapsed_seconds] = end_time - start_time
324
353
  ret[:lines_matched] = total_matches
354
+ ret[:files_processed] = n
325
355
 
326
356
  l.log "Matches: #{total_matches.to_s}"
327
357
  l.log "Enlapsed seconds: #{ret[:enlapsed_seconds].to_s}"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv-indexer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Leandro Daniel Sardi