csv-indexer 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/csv-indexer.rb +333 -0
- metadata +86 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 2a91e291c094278a60e60a898b31063afab3910ec6a01b81fcdd0451f61acdda
|
4
|
+
data.tar.gz: f80471808e5f317f6e1040a7c858f48d9e6f853fc5cb2263ab168b216ee17a2d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: efd9de2b0a242963542466b53045b07120722119e16c52de77b9c016058b2c684e99b779cd2d54ea49c1c2c10dfcf2fb95e0314d4504f1c7f1bb9dca7455cab8
|
7
|
+
data.tar.gz: '053588f0ae2c3e598c5039f8be10e9cededf4076393585c3a94235774ed11302bc9293b0b0ed58a524d8ecda581a8bbcf7226f689a53defc734ae07c01714e47'
|
data/lib/csv-indexer.rb
ADDED
@@ -0,0 +1,333 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'simple_cloud_logging'
|
3
|
+
|
4
|
+
module BlackStack
|
5
|
+
module CSVIndexer
|
6
|
+
@indexes = []
|
7
|
+
|
8
|
+
# Module-level registry accessor.
#
# Returns the array of Index objects registered so far via
# +add_indexation+.
def self.indexes
  @indexes
end
|
11
|
+
|
12
|
+
# Register a new index from a configuration hash.
#
# Builds a BlackStack::CSVIndexer::Index from +h+ (validation happens
# inside Index#initialize) and appends it to the module registry.
def self.add_indexation(h)
  created = BlackStack::CSVIndexer::Index.new(h)
  @indexes << created
end
|
15
|
+
|
16
|
+
# Run the indexation process of the index registered under +name+.
#
# Raises 'Index not found.' if no index with that name is registered.
def self.index(name, write_log=true)
  # fix: the original used `i.name = name` (assignment) instead of
  # `==` (comparison). That overwrote the name of every registered
  # index and, since the assignment is truthy, always selected the
  # first index regardless of `name`. The block parameter also
  # shadowed the outer local `i`.
  idx = @indexes.select { |ndx| ndx.name == name }.first
  raise 'Index not found.' if idx.nil?
  idx.index(write_log)
end
|
21
|
+
|
22
|
+
# Search the index registered under +name+ for +key+.
# Delegates to Index#find and returns its result hash.
#
# Raises 'Index not found.' if no index with that name is registered.
def self.find(name, key, exact_match=true, write_log=false)
  # fix: `==` (comparison), not `=` (assignment) — the original
  # clobbered every index's name with `name` and always picked the
  # first registered index.
  idx = @indexes.select { |ndx| ndx.name == name }.first
  raise 'Index not found.' if idx.nil?
  idx.find(key, exact_match, write_log)
end
|
27
|
+
|
28
|
+
# define Index class
|
29
|
+
class Index
|
30
|
+
attr_accessor :name, :description, :input, :output, :log, :mapping, :keys, :logger
|
31
|
+
|
32
|
+
# Build an Index from a configuration hash.
#
# Mandatory keys:
#   :name    [String] unique name of the index (also the index-file extension)
#   :mapping [Hash]   field name => column number in the CSV
#   :keys    [Array]  subset of mapping keys used to build the search key
# Optional keys:
#   :description [String]
#   :input       [String] glob of CSV files to index (default './*.csv')
#   :output      [String] directory for index files   (default './')
#   :log         [String] directory for the log file  (default './')
#
# Raises a RuntimeError listing every validation error found.
def initialize(h)
  errors = []

  # the parameter must be a hash
  raise "The parameter must be a hash." unless h.is_a?(Hash)

  # :name is mandatory and must be a string.
  # fix: type checks now run only when the key is present, so a
  # missing mandatory key reports one error instead of two
  # (previously "is mandatory" AND "must be a string" were both added).
  errors << "The parameter :name is mandatory." unless h.has_key?(:name)
  errors << "The parameter :name must be a string." if h.has_key?(:name) && !h[:name].is_a?(String)

  # optional parameters: type-check only when present
  errors << "The parameter :description must be a string." if h.has_key?(:description) && !h[:description].is_a?(String)
  errors << "The parameter :input must be a string." if h.has_key?(:input) && !h[:input].is_a?(String)
  errors << "The parameter :output must be a string." if h.has_key?(:output) && !h[:output].is_a?(String)
  errors << "The parameter :log must be a string." if h.has_key?(:log) && !h[:log].is_a?(String)

  # :mapping is mandatory and must be a hash
  errors << "The parameter :mapping is mandatory." unless h.has_key?(:mapping)
  errors << "The parameter :mapping must be a hash." if h.has_key?(:mapping) && !h[:mapping].is_a?(Hash)

  # :keys is mandatory and must be an array
  errors << "The parameter :keys is mandatory." unless h.has_key?(:keys)
  errors << "The parameter :keys must be an array." if h.has_key?(:keys) && !h[:keys].is_a?(Array)

  # :name must not collide with an already registered index
  errors << "The parameter :name must be unique." if BlackStack::CSVIndexer.indexes.map { |idx| idx.name }.include?(h[:name])

  # if errors happened, raise an exception
  raise "The following errors happened while creating the index: #{errors.join(', ')}" unless errors.empty?

  # default values
  h[:input] = './*.csv' unless h.has_key?(:input)
  h[:output] = './' unless h.has_key?(:output)
  h[:log] = './' unless h.has_key?(:log)

  # logger writing to <log>/<name>.log
  self.logger = BlackStack::LocalLogger.new("#{h[:log]}/#{h[:name]}.log")

  # set the attributes
  self.name = h[:name]
  self.description = h[:description]
  self.input = h[:input]
  self.output = h[:output]
  self.log = h[:log]
  self.mapping = h[:mapping]
  self.keys = h[:keys]
end
|
95
|
+
|
96
|
+
# create the index file
|
97
|
+
# Create the index file for each CSV file matching `self.input`.
#
# For every input row the index line contains: the quoted key (key
# columns joined with '|'), the 1-based row number, each mapped
# column value, and — as the last field — the byte size of the
# final line, which `find` uses to realign after seeking into the
# middle of a line. Lines are sorted so `find` can binary-search.
# An input file whose index file already exists is skipped.
def index(write_log=true)
  # define the logger to use
  l = write_log ? self.logger : BlackStack::DummyLogger.new
  # output file extension
  ext = ".#{self.name}"
  # index the bites
  Dir.glob(input).each do |file|
    # get the name and path of the file from the full path
    name = file.split('/').last
    path = file.gsub("/#{name}", '')
    # opening log line
    l.logs "Indexing #{name}... "
    # get the output filename
    output_filename = "#{File.expand_path(self.output)}/#{name.gsub(/\.csv$/, ext)}"
    # if output file exists, skip.
    # fix: File.exists? was deprecated and removed in Ruby 3.2 — use File.exist?
    if File.exist?(output_filename)
      l.logf "skip"
    else
      i = 0
      a = []
      # fix: the input handle was never closed — ensure it is,
      # even if a line fails to parse
      input_file = File.open(file, 'r')
      begin
        # iterate lines of input_file
        input_file.each_line do |line|
          i += 1
          fields = []
          key = []
          # get the array of fields
          row = CSV.parse_line(line)
          # build the key from the configured key columns
          self.keys.each do |k|
            colnum = self.mapping[k]
            # replace '"' by empty string, and '|' (the key separator) with ','.
            # fix: `.to_s` guards against rows shorter than the mapping
            # (row[colnum] would be nil and gsub would crash).
            key << row[colnum].to_s.gsub('"', '').gsub('|', ',')
          end
          key = "\"#{key.join('|')}\""
          # key first, row number second
          fields << key
          fields << "\"#{i.to_s}\""
          # then every mapped column, quoted
          self.mapping.each do |k, v|
            fields << "\"#{row[v].to_s.gsub('"', '')}\""
          end
          a << fields
        end
      ensure
        input_file.close
      end
      # sort the array so `find` can binary-search
      a.sort!
      # write the index file
      output_file = File.open(output_filename, 'w')
      begin
        size = nil
        new_size = nil
        a.each do |row|
          line = row.join(',')
          # append the size of the final line as a last field;
          # this value is necessary to run the search.
          size = line.size
          new_size = size + 1 + 2 + size.to_s.size # 1 comma, 2 double-quotes, and size of the size
          new_size += 1 if size.to_s.size < new_size.to_s.size # sum 1 if new_size has 1 more digit than size (e.g. 104 vs 99)
          size = new_size
          line += ",\"#{size.to_s}\""
          output_file.puts line
        end
      ensure
        output_file.close
      end
      # close log
      l.done
    end
  end
end # def index
|
174
|
+
|
175
|
+
# Compare two keys (arrays of string values).
#
# If !exact_match and each value of key1 is contained
# (case-insensitive, substring match) in the corresponding value of
# key2, return 0.
# Otherwise return 0 if equal, 1 if key1 < key2, -1 if key1 > key2.
#
# fix: the original comment (and examples) claimed -1 for
# key1 < key2, but the code returns 1 in that case — and the binary
# search in `find` depends on that sign convention (1 means "search
# the lower half"). The documentation is corrected here; the logic
# is unchanged.
#
# This method is used by the binary search and should not be used
# directly by the user. Keys are expected to be arrays of strings.
#
# Example:
#    compare_keys(['Century 21'], ['Century 21 LLC'], false)
#    => 0
#
# Example:
#    compare_keys(['Century 21'], ['Century 21 LLC'], true)
#    => 1
#
def compare_keys(key1, key2, exact_match=true)
  match = true
  # keys are already arrays of values
  a1 = key1
  a2 = key2
  # validation: key2 must have at least as many elements as key1
  # (fix: error message grammar — "must has" -> "must have")
  raise 'The key2 must have more elements than key1.' if a2.size < a1.size
  # substring match, case-insensitive; extra elements of a2 match
  # anything because a1[j].to_s is '' past the end of a1
  a2.each_with_index do |v, j|
    match = false if v !~ /#{Regexp.escape(a1[j].to_s)}/i
  end
  return 0 if match && !exact_match
  # exact comparison, element by element, case-insensitive
  a1.each_with_index do |v, j|
    if v.upcase < a2[j].upcase
      return 1
    elsif v.upcase > a2[j].upcase
      return -1
    end
  end
  # the keys are equal
  return 0
end
|
214
|
+
|
215
|
+
# Search the index files for +key+.
#
# Performs a byte-offset binary search over each sorted index file
# produced by #index (the last field of every line stores the line's
# byte size, which lets the search realign to the start of a line
# after seeking to an arbitrary offset).
#
# Returns a hash with :matches (array of matching index rows),
# :enlapsed_seconds and :lines_matched.
def find(key, exact_match=true, write_log=false)
  # normalize: a bare string becomes a one-element key
  key = [key] if key.is_a?(String)

  # response skeleton
  ret = {
    :matches => [],
  }

  # pick the logger
  l = write_log ? self.logger : BlackStack::DummyLogger.new

  # glob matching every index file of this index
  source = "#{File.expand_path(self.output)}/*.#{self.name}"

  start_time = Time.now
  total_matches = 0

  l.log "Search term: #{key.to_s}"
  files = Dir.glob(source)
  n = 0
  files.each do |file|
    # split the full path into file name and directory
    name = file.split('/').last
    path = file.gsub("/#{name}", '')
    l.logs "Searching into #{name}... "
    # binary-search boundaries, in bytes
    lo = 0
    max = `wc -c #{file}`.split(' ').first.to_i
    middle = ((lo + max) / 2).to_i
    # open the index file for random access
    f = File.open(file, 'r')
    # middle offset of the previous iteration, to detect stalls
    prev = -1
    while lo < max
      middle = ((lo + max) / 2).to_i
      # stop when the midpoint no longer moves
      break if middle == prev
      prev = middle
      l.logs "#{middle}... "
      # seek lands mid-line; readline returns the remainder of it
      f.seek(middle)
      line = f.readline
      parts = line.split('","')
      # back up while the cursor sits inside the trailing size field
      while parts.size < 2
        middle -= 1
        f.seek(middle)
        line = f.readline
        parts = line.split('","')
      end
      # last field holds the full line size; rewind to the line start
      line_size = parts.last.gsub('"', '').to_i
      middle -= line_size - line.size + 1
      f.seek(middle)
      line = f.readline
      line.strip!
      # first CSV field is the '|'-joined key
      fields = CSV.parse_line(line)
      row_key = fields[0].split('|')
      # compare the search key against this row's key
      x = compare_keys(key, row_key, exact_match)
      if x == 0
        # match
        l.logf "found (#{row_key})"
        ret[:matches] << fields.dup
        total_matches += 1
        break
      else
        if x == 1
          # continue in the lower half
          max = middle
        else
          # continue in the upper half
          lo = middle + line.size + 1
        end
        l.logf "not found (#{row_key})"
      end
    end
    f.close
    l.done
    n += 1
  end

  end_time = Time.now

  ret[:enlapsed_seconds] = end_time - start_time
  ret[:lines_matched] = total_matches

  l.log "Matches: #{total_matches.to_s}"
  l.log "Enlapsed seconds: #{ret[:enlapsed_seconds].to_s}"

  ret
end # def find
|
331
|
+
end
|
332
|
+
end # module CSVIndexer
|
333
|
+
end # module BlackStack
|
metadata
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: csv-indexer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Leandro Daniel Sardi
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2022-11-08 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: csv
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 3.2.2
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 3.2.2
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 3.2.2
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 3.2.2
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: simple_cloud_logging
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: 1.2.2
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 1.2.2
|
43
|
+
type: :runtime
|
44
|
+
prerelease: false
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - "~>"
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 1.2.2
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 1.2.2
|
53
|
+
description: 'CSV Indexer makes it simple to index and search large CSV
|
54
|
+
files. It is not as robust as Lucence, but it is simple and cost-effective. May
|
55
|
+
index files with millions of rows and find specific rows in matter of seconds. Find
|
56
|
+
documentation here: https://github.com/leandrosardi/csv-indexer.'
|
57
|
+
email: leandro.sardi@expandedventure.com
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- lib/csv-indexer.rb
|
63
|
+
homepage: https://github.com/leandrosardi/csv-indexer
|
64
|
+
licenses:
|
65
|
+
- MIT
|
66
|
+
metadata: {}
|
67
|
+
post_install_message:
|
68
|
+
rdoc_options: []
|
69
|
+
require_paths:
|
70
|
+
- lib
|
71
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - ">="
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: '0'
|
81
|
+
requirements: []
|
82
|
+
rubygems_version: 3.3.7
|
83
|
+
signing_key:
|
84
|
+
specification_version: 4
|
85
|
+
summary: CSV Indexer makes it simple to index and search large CSV files.
|
86
|
+
test_files: []
|