csv-indexer 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/csv-indexer.rb +97 -67
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 447d47db02c690a8bda0c1418a3373363c52f1e9295e962f006ddf5123f10c42
|
4
|
+
data.tar.gz: 0f0f2a0910b7e5d4128d0f132acc071091cad18166251f873593015a8f9bf1d8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 541ed58833c51c0f89e5bc4d217fb5b28fef3613b97f12711bd710a1d9e8bce425a69b7574f229b88c1bcf2caae2ad7fbb4db76d155c54b214f543d723e18823
|
7
|
+
data.tar.gz: 5b68297cf8b08d43618dc6ef34474ca30988fae542fcdd4cdf8f729c77d416a878f265d37faaf7aa640a91f2b101e517171eb979d141163c272da1c7ea71cfcd
|
data/lib/csv-indexer.rb
CHANGED
@@ -99,6 +99,7 @@ module BlackStack
|
|
99
99
|
l = write_log ? self.logger : BlackStack::DummyLogger.new
|
100
100
|
# output file extension
|
101
101
|
ext = ".#{self.name}"
|
102
|
+
|
102
103
|
# index the bites
|
103
104
|
Dir.glob(input).each do |file|
|
104
105
|
# get the name of the file from the full path
|
@@ -120,30 +121,34 @@ module BlackStack
|
|
120
121
|
a = []
|
121
122
|
# iterate the lines of input_file
|
122
123
|
input_file.each_line do |line|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
124
|
+
begin
|
125
|
+
i += 1
|
126
|
+
fields = []
|
127
|
+
key = []
|
128
|
+
# get the array of fields
|
129
|
+
row = CSV.parse_line(line)
|
130
|
+
# build the key
|
131
|
+
self.keys.each do |k|
|
132
|
+
colnum = self.mapping[k]
|
133
|
+
# replace '"' by empty string, and '|' with ','
|
134
|
+
key << row[colnum].gsub('"', '').gsub('|', ',')
|
135
|
+
end
|
136
|
+
key = "\"#{key.join('|')}\""
|
137
|
+
# add the key as the first field of the index line
|
138
|
+
fields << key
|
139
|
+
# add the row number as the second field of the index line
|
140
|
+
fields << "\"#{i.to_s}\""
|
141
|
+
# iterate the mapping
|
142
|
+
self.mapping.each do |k, v|
|
143
|
+
# get the data from the row
|
144
|
+
# format the field values for the CSV
|
145
|
+
fields << "\"#{row[v].to_s.gsub('"', '')}\""
|
146
|
+
end
|
147
|
+
# add fields to the array
|
148
|
+
a << fields
|
149
|
+
rescue => e
|
150
|
+
# what to do with this?
|
133
151
|
end
|
134
|
-
key = "\"#{key.join('|')}\""
|
135
|
-
# add the key as the first field of the index line
|
136
|
-
fields << key
|
137
|
-
# add the row number as the second field of the index line
|
138
|
-
fields << "\"#{i.to_s}\""
|
139
|
-
# iterate the mapping
|
140
|
-
self.mapping.each do |k, v|
|
141
|
-
# get the data from the row
|
142
|
-
# format the field values for the CSV
|
143
|
-
fields << "\"#{row[v].gsub('"', '')}\""
|
144
|
-
end
|
145
|
-
# add fields to the array
|
146
|
-
a << fields
|
147
152
|
end
|
148
153
|
# sort the array
|
149
154
|
a.sort!
|
@@ -193,9 +198,11 @@ module BlackStack
|
|
193
198
|
a2 = key2 #.split('|')
|
194
199
|
# validation: a2.size >= a1.size
|
195
200
|
raise 'The key2 must has more elements than key1.' if a2.size < a1.size
|
201
|
+
#binding.pry if a2[0].include?('anubhava-mishra-1b668124')
|
196
202
|
# iterate the arrays
|
197
203
|
a2.each_with_index do |k, i|
|
198
|
-
|
204
|
+
#binding.pry if k.include?('anubhava-mishra-1b668124')
|
205
|
+
match = false if k !~ /^#{Regexp.escape(a1[i].to_s)}/i
|
199
206
|
end
|
200
207
|
return 0 if match && !exact_match
|
201
208
|
# return the result
|
@@ -248,7 +255,7 @@ module BlackStack
|
|
248
255
|
l.logs "Searching into #{name}... "
|
249
256
|
# setting boundaries for the binary search
|
250
257
|
i = 0
|
251
|
-
max = `wc -c #{file}`.split(' ').first.to_i
|
258
|
+
max = `wc -c '#{file}'`.split(' ').first.to_i
|
252
259
|
middle = ((i + max) / 2).to_i
|
253
260
|
# totals
|
254
261
|
# open file with random access
|
@@ -260,55 +267,74 @@ module BlackStack
|
|
260
267
|
# get the middle of the file
|
261
268
|
middle = ((i + max) / 2).to_i
|
262
269
|
# break if the middle is the same as the previous iteration
|
270
|
+
#binding.pry if middle==prev
|
263
271
|
break if middle==prev
|
264
272
|
# remember the middle in this iteration
|
265
273
|
prev = middle
|
266
274
|
# opening log line
|
267
275
|
l.logs "#{middle}... "
|
268
|
-
#
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
line = f.readline
|
274
|
-
# most probably I landed in the midle of a line, so I have to get the size of the line where I landed.
|
275
|
-
a = line.split('","')
|
276
|
-
while a.size < 2 # this saves the situation when the cursor is inside the last field where I place the size of the line
|
277
|
-
middle -= 1
|
276
|
+
# processing the line
|
277
|
+
line = ''
|
278
|
+
line_size = 0
|
279
|
+
begin
|
280
|
+
# go to the middle of the file
|
278
281
|
f.seek(middle)
|
279
|
-
|
282
|
+
# read the line
|
283
|
+
# the cursor is at the middle of a line
|
284
|
+
# so, I have to read a second line to get a full line
|
285
|
+
line = f.readline.encode('UTF-8', :undef => :replace, :invalid => :replace, :replace => " ")
|
286
|
+
#binding.pry if line.include?('anubhav521@gmail.com')
|
287
|
+
# most probably I landed in the middle of a line, so I have to get the size of the line where I landed.
|
280
288
|
a = line.split('","')
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
line = f.readline
|
287
|
-
# strip the line
|
288
|
-
line.strip!
|
289
|
-
# get the first field of the CSV line
|
290
|
-
fields = CSV.parse_line(line)
|
291
|
-
row_key = fields[0].split('|')
|
292
|
-
# compare keys
|
293
|
-
x = compare_keys(key, row_key, exact_match)
|
294
|
-
# compare the first field with the search term
|
295
|
-
if x == 0
|
296
|
-
# found
|
297
|
-
l.logf "found (#{row_key})"
|
298
|
-
ret[:matches] << fields.dup
|
299
|
-
total_matches += 1
|
300
|
-
break
|
301
|
-
else
|
302
|
-
# not found
|
303
|
-
if x == 1
|
304
|
-
# search in the down half
|
305
|
-
max = middle
|
306
|
-
else #if x == -1
|
307
|
-
# search in the up half
|
308
|
-
i = middle + line.size+1
|
289
|
+
while a.size < 2 # this saves the situation when the cursor is inside the last field where I place the size of the line
|
290
|
+
middle -= 1
|
291
|
+
f.seek(middle)
|
292
|
+
line = f.readline
|
293
|
+
a = line.split('","')
|
309
294
|
end
|
310
|
-
|
311
|
-
|
295
|
+
line_size = a.last.gsub('"', '').to_i
|
296
|
+
middle -= line_size-line.size+1
|
297
|
+
# seek and readline again, to get the line from its beginning
|
298
|
+
f.seek(middle)
|
299
|
+
line = f.readline.encode('UTF-8', :undef => :replace, :invalid => :replace, :replace => " ")
|
300
|
+
# BAD PRACTICE PATCH: sometimes the new value of middle (`middle -= line_size-line.size+1`) doesn't land on the start of the line.
|
301
|
+
while line[0] != '"'
|
302
|
+
middle -= 1
|
303
|
+
f.seek(middle)
|
304
|
+
line = f.readline.encode('UTF-8', :undef => :replace, :invalid => :replace, :replace => " ")
|
305
|
+
end
|
306
|
+
# strip the line
|
307
|
+
line.strip!
|
308
|
+
# get the first field of the CSV line
|
309
|
+
fields = CSV.parse_line(line)
|
310
|
+
row_key = fields[0].split('|')
|
311
|
+
# compare keys
|
312
|
+
#binding.pry if line.include?('anubhava-mishra-1b668124')
|
313
|
+
x = compare_keys(key, row_key, exact_match)
|
314
|
+
# compare the first field with the search term
|
315
|
+
if x == 0
|
316
|
+
# found
|
317
|
+
l.logf "found (#{row_key})"
|
318
|
+
ret[:matches] << fields.dup
|
319
|
+
total_matches += 1
|
320
|
+
break
|
321
|
+
else
|
322
|
+
# not found
|
323
|
+
if x == 1
|
324
|
+
# search in the down half
|
325
|
+
max = middle
|
326
|
+
else #if x == -1
|
327
|
+
# search in the up half
|
328
|
+
i = middle + line.size+1
|
329
|
+
end
|
330
|
+
l.logf "not found (#{row_key})"
|
331
|
+
end
|
332
|
+
rescue => e
|
333
|
+
l.logf "error in line `#{line}`: #{e.to_console}"
|
334
|
+
# change the max, in order not to repeat the same iteration and exit the loop at the line `break if middle==prev`
|
335
|
+
#i+=1
|
336
|
+
#max+=1
|
337
|
+
end # begin
|
312
338
|
end
|
313
339
|
# closing the file
|
314
340
|
f.close
|
@@ -316,12 +342,16 @@ module BlackStack
|
|
316
342
|
l.done
|
317
343
|
# increment file counter
|
318
344
|
n += 1
|
345
|
+
# tracing log
|
346
|
+
l.log "i: #{i.to_s}"
|
347
|
+
l.log "max: #{max.to_s}"
|
319
348
|
end
|
320
|
-
|
349
|
+
# end time
|
321
350
|
end_time = Time.now
|
322
351
|
|
323
352
|
ret[:enlapsed_seconds] = end_time - start_time
|
324
353
|
ret[:lines_matched] = total_matches
|
354
|
+
ret[:files_processed] = n
|
325
355
|
|
326
356
|
l.log "Matches: #{total_matches.to_s}"
|
327
357
|
l.log "Enlapsed seconds: #{ret[:enlapsed_seconds].to_s}"
|