csv-indexer 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/csv-indexer.rb +97 -67
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 447d47db02c690a8bda0c1418a3373363c52f1e9295e962f006ddf5123f10c42
|
4
|
+
data.tar.gz: 0f0f2a0910b7e5d4128d0f132acc071091cad18166251f873593015a8f9bf1d8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 541ed58833c51c0f89e5bc4d217fb5b28fef3613b97f12711bd710a1d9e8bce425a69b7574f229b88c1bcf2caae2ad7fbb4db76d155c54b214f543d723e18823
|
7
|
+
data.tar.gz: 5b68297cf8b08d43618dc6ef34474ca30988fae542fcdd4cdf8f729c77d416a878f265d37faaf7aa640a91f2b101e517171eb979d141163c272da1c7ea71cfcd
|
data/lib/csv-indexer.rb
CHANGED
@@ -99,6 +99,7 @@ module BlackStack
|
|
99
99
|
l = write_log ? self.logger : BlackStack::DummyLogger.new
|
100
100
|
# output file extension
|
101
101
|
ext = ".#{self.name}"
|
102
|
+
|
102
103
|
# index the bites
|
103
104
|
Dir.glob(input).each do |file|
|
104
105
|
# get the name of the file from the full path
|
@@ -120,30 +121,34 @@ module BlackStack
|
|
120
121
|
a = []
|
121
122
|
# iterate lines if input_file
|
122
123
|
input_file.each_line do |line|
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
124
|
+
begin
|
125
|
+
i += 1
|
126
|
+
fields = []
|
127
|
+
key = []
|
128
|
+
# get the array of fields
|
129
|
+
row = CSV.parse_line(line)
|
130
|
+
# build the key
|
131
|
+
self.keys.each do |k|
|
132
|
+
colnum = self.mapping[k]
|
133
|
+
# replace '"' by empty string, and '|' with ','
|
134
|
+
key << row[colnum].gsub('"', '').gsub('|', ',')
|
135
|
+
end
|
136
|
+
key = "\"#{key.join('|')}\""
|
137
|
+
# add the key as the first field of the index line
|
138
|
+
fields << key
|
139
|
+
# add the row number as the second field of the index line
|
140
|
+
fields << "\"#{i.to_s}\""
|
141
|
+
# iterate the mapping
|
142
|
+
self.mapping.each do |k, v|
|
143
|
+
# get the data from the row
|
144
|
+
# format the field values for the CSV
|
145
|
+
fields << "\"#{row[v].to_s.gsub('"', '')}\""
|
146
|
+
end
|
147
|
+
# add fields to the array
|
148
|
+
a << fields
|
149
|
+
rescue => e
|
150
|
+
# what to do with this?
|
133
151
|
end
|
134
|
-
key = "\"#{key.join('|')}\""
|
135
|
-
# add the key as the first field of the index line
|
136
|
-
fields << key
|
137
|
-
# add the row number as the second field of the index line
|
138
|
-
fields << "\"#{i.to_s}\""
|
139
|
-
# iterate the mapping
|
140
|
-
self.mapping.each do |k, v|
|
141
|
-
# get the data from the row
|
142
|
-
# format the field values for the CSV
|
143
|
-
fields << "\"#{row[v].gsub('"', '')}\""
|
144
|
-
end
|
145
|
-
# add fields to the array
|
146
|
-
a << fields
|
147
152
|
end
|
148
153
|
# sort the array
|
149
154
|
a.sort!
|
@@ -193,9 +198,11 @@ module BlackStack
|
|
193
198
|
a2 = key2 #.split('|')
|
194
199
|
# validation: a2.size >= a1.size
|
195
200
|
raise 'The key2 must has more elements than key1.' if a2.size < a1.size
|
201
|
+
#binding.pry if a2[0].include?('anubhava-mishra-1b668124')
|
196
202
|
# iterate the arrays
|
197
203
|
a2.each_with_index do |k, i|
|
198
|
-
|
204
|
+
#binding.pry if k.include?('anubhava-mishra-1b668124')
|
205
|
+
match = false if k !~ /^#{Regexp.escape(a1[i].to_s)}/i
|
199
206
|
end
|
200
207
|
return 0 if match && !exact_match
|
201
208
|
# return the result
|
@@ -248,7 +255,7 @@ module BlackStack
|
|
248
255
|
l.logs "Searching into #{name}... "
|
249
256
|
# setting boundaries for the binary search
|
250
257
|
i = 0
|
251
|
-
max = `wc -c #{file}`.split(' ').first.to_i
|
258
|
+
max = `wc -c '#{file}'`.split(' ').first.to_i
|
252
259
|
middle = ((i + max) / 2).to_i
|
253
260
|
# totals
|
254
261
|
# open file with random access
|
@@ -260,55 +267,74 @@ module BlackStack
|
|
260
267
|
# get the middle of the file
|
261
268
|
middle = ((i + max) / 2).to_i
|
262
269
|
# break if the middle is the same as the previous iteration
|
270
|
+
#binding.pry if middle==prev
|
263
271
|
break if middle==prev
|
264
272
|
# remember the middle in this iteration
|
265
273
|
prev = middle
|
266
274
|
# opening log line
|
267
275
|
l.logs "#{middle}... "
|
268
|
-
#
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
line = f.readline
|
274
|
-
# most probably I landed in the midle of a line, so I have to get the size of the line where I landed.
|
275
|
-
a = line.split('","')
|
276
|
-
while a.size < 2 # this saves the situation when the cursor is inside the last field where I place the size of the line
|
277
|
-
middle -= 1
|
276
|
+
# processing the line
|
277
|
+
line = ''
|
278
|
+
line_size = 0
|
279
|
+
begin
|
280
|
+
# go to the middle of the file
|
278
281
|
f.seek(middle)
|
279
|
-
|
282
|
+
# read the line
|
283
|
+
# the cursor is at the middle of a line
|
284
|
+
# so, I have to read a second line to get a full line
|
285
|
+
line = f.readline.encode('UTF-8', :undef => :replace, :invalid => :replace, :replace => " ")
|
286
|
+
#binding.pry if line.include?('anubhav521@gmail.com')
|
287
|
+
# most probably I landed in the middle of a line, so I have to get the size of the line where I landed.
|
280
288
|
a = line.split('","')
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
line = f.readline
|
287
|
-
# strip the line
|
288
|
-
line.strip!
|
289
|
-
# get the first field of the CSV line
|
290
|
-
fields = CSV.parse_line(line)
|
291
|
-
row_key = fields[0].split('|')
|
292
|
-
# compare keys
|
293
|
-
x = compare_keys(key, row_key, exact_match)
|
294
|
-
# compare the first field with the search term
|
295
|
-
if x == 0
|
296
|
-
# found
|
297
|
-
l.logf "found (#{row_key})"
|
298
|
-
ret[:matches] << fields.dup
|
299
|
-
total_matches += 1
|
300
|
-
break
|
301
|
-
else
|
302
|
-
# not found
|
303
|
-
if x == 1
|
304
|
-
# search in the down half
|
305
|
-
max = middle
|
306
|
-
else #if x == -1
|
307
|
-
# search in the up half
|
308
|
-
i = middle + line.size+1
|
289
|
+
while a.size < 2 # this saves the situation when the cursor is inside the last field where I place the size of the line
|
290
|
+
middle -= 1
|
291
|
+
f.seek(middle)
|
292
|
+
line = f.readline
|
293
|
+
a = line.split('","')
|
309
294
|
end
|
310
|
-
|
311
|
-
|
295
|
+
line_size = a.last.gsub('"', '').to_i
|
296
|
+
middle -= line_size-line.size+1
|
297
|
+
# seek and readline again, to get the line from its beginning
|
298
|
+
f.seek(middle)
|
299
|
+
line = f.readline.encode('UTF-8', :undef => :replace, :invalid => :replace, :replace => " ")
|
300
|
+
# BAD PRACTICE PATCH: sometimes the new value of middle (`middle -= line_size-line.size+1`) doesn't hit the start of the line.
|
301
|
+
while line[0] != '"'
|
302
|
+
middle -= 1
|
303
|
+
f.seek(middle)
|
304
|
+
line = f.readline.encode('UTF-8', :undef => :replace, :invalid => :replace, :replace => " ")
|
305
|
+
end
|
306
|
+
# strip the line
|
307
|
+
line.strip!
|
308
|
+
# get the first field of the CSV line
|
309
|
+
fields = CSV.parse_line(line)
|
310
|
+
row_key = fields[0].split('|')
|
311
|
+
# compare keys
|
312
|
+
#binding.pry if line.include?('anubhava-mishra-1b668124')
|
313
|
+
x = compare_keys(key, row_key, exact_match)
|
314
|
+
# compare the first field with the search term
|
315
|
+
if x == 0
|
316
|
+
# found
|
317
|
+
l.logf "found (#{row_key})"
|
318
|
+
ret[:matches] << fields.dup
|
319
|
+
total_matches += 1
|
320
|
+
break
|
321
|
+
else
|
322
|
+
# not found
|
323
|
+
if x == 1
|
324
|
+
# search in the down half
|
325
|
+
max = middle
|
326
|
+
else #if x == -1
|
327
|
+
# search in the up half
|
328
|
+
i = middle + line.size+1
|
329
|
+
end
|
330
|
+
l.logf "not found (#{row_key})"
|
331
|
+
end
|
332
|
+
rescue => e
|
333
|
+
l.logf "error in line `#{line}`: #{e.to_console}"
|
334
|
+
# change the max, in order not to repeat the same iteration and exit the loop in the line `break if middle==prev`
|
335
|
+
#i+=1
|
336
|
+
#max+=1
|
337
|
+
end # begin
|
312
338
|
end
|
313
339
|
# closing the file
|
314
340
|
f.close
|
@@ -316,12 +342,16 @@ module BlackStack
|
|
316
342
|
l.done
|
317
343
|
# increment file counter
|
318
344
|
n += 1
|
345
|
+
# tracing log
|
346
|
+
l.log "i: #{i.to_s}"
|
347
|
+
l.log "max: #{max.to_s}"
|
319
348
|
end
|
320
|
-
|
349
|
+
# end time
|
321
350
|
end_time = Time.now
|
322
351
|
|
323
352
|
ret[:enlapsed_seconds] = end_time - start_time
|
324
353
|
ret[:lines_matched] = total_matches
|
354
|
+
ret[:files_processed] = n
|
325
355
|
|
326
356
|
l.log "Matches: #{total_matches.to_s}"
|
327
357
|
l.log "Enlapsed seconds: #{ret[:enlapsed_seconds].to_s}"
|