csv-indexer 1.0.1

Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/csv-indexer.rb +333 -0
  3. metadata +86 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 2a91e291c094278a60e60a898b31063afab3910ec6a01b81fcdd0451f61acdda
+   data.tar.gz: f80471808e5f317f6e1040a7c858f48d9e6f853fc5cb2263ab168b216ee17a2d
+ SHA512:
+   metadata.gz: efd9de2b0a242963542466b53045b07120722119e16c52de77b9c016058b2c684e99b779cd2d54ea49c1c2c10dfcf2fb95e0314d4504f1c7f1bb9dca7455cab8
+   data.tar.gz: '053588f0ae2c3e598c5039f8be10e9cededf4076393585c3a94235774ed11302bc9293b0b0ed58a524d8ecda581a8bbcf7226f689a53defc734ae07c01714e47'
data/lib/csv-indexer.rb ADDED
@@ -0,0 +1,333 @@
+ require 'csv'
+ require 'simple_cloud_logging'
+
+ module BlackStack
+   module CSVIndexer
+     @indexes = []
+
+     def self.indexes
+       @indexes
+     end
+
+     def self.add_indexation(h)
+       @indexes << BlackStack::CSVIndexer::Index.new(h)
+     end
+
+     def self.index(name, write_log=true)
+       i = @indexes.select { |idx| idx.name == name }.first
+       raise 'Index not found.' if i.nil?
+       i.index(write_log)
+     end
+
+     def self.find(name, key, exact_match=true, write_log=false)
+       i = @indexes.select { |idx| idx.name == name }.first
+       raise 'Index not found.' if i.nil?
+       i.find(key, exact_match, write_log)
+     end
+
+     # define Index class
+     class Index
+       attr_accessor :name, :description, :input, :output, :log, :mapping, :keys, :logger
+
+       def initialize(h)
+         errors = []
+
+         # validate: h is a hash
+         raise "The parameter must be a hash." unless h.is_a?(Hash)
+
+         # validate: :name is present
+         errors << "The parameter :name is mandatory." unless h.has_key?(:name)
+
+         # validate: :name is a string
+         errors << "The parameter :name must be a string." unless h[:name].is_a?(String)
+
+         # validate: if :description is present, it is a string
+         errors << "The parameter :description must be a string." if h.has_key?(:description) && !h[:description].is_a?(String)
+
+         # validate: if :input is present, it is a string
+         errors << "The parameter :input must be a string." if h.has_key?(:input) && !h[:input].is_a?(String)
+
+         # validate: if :output is present, it is a string
+         errors << "The parameter :output must be a string." if h.has_key?(:output) && !h[:output].is_a?(String)
+
+         # validate: if :log is present, it is a string
+         errors << "The parameter :log must be a string." if h.has_key?(:log) && !h[:log].is_a?(String)
+
+         # validate: :mapping is present
+         errors << "The parameter :mapping is mandatory." unless h.has_key?(:mapping)
+
+         # validate: :mapping is a hash
+         errors << "The parameter :mapping must be a hash." unless h[:mapping].is_a?(Hash)
+
+         # validate: :keys is present
+         errors << "The parameter :keys is mandatory." unless h.has_key?(:keys)
+
+         # validate: :keys is an array
+         errors << "The parameter :keys must be an array." unless h[:keys].is_a?(Array)
+
+         # validate: :name is unique
+         errors << "The parameter :name must be unique." if BlackStack::CSVIndexer.indexes.map{|i| i.name}.include?(h[:name])
+
+         # if errors happened, raise an exception
+         raise "The following errors happened while creating the index: #{errors.join(', ')}" unless errors.empty?
+
+         # default value for :input
+         h[:input] = './*.csv' unless h.has_key?(:input)
+
+         # default value for :output
+         h[:output] = './' unless h.has_key?(:output)
+
+         # default value for :log
+         h[:log] = './' unless h.has_key?(:log)
+
+         # create the logger
+         self.logger = BlackStack::LocalLogger.new("#{h[:log]}/#{h[:name]}.log")
+
+         # set the attributes
+         self.name = h[:name]
+         self.description = h[:description]
+         self.input = h[:input]
+         self.output = h[:output]
+         self.log = h[:log]
+         self.mapping = h[:mapping]
+         self.keys = h[:keys]
+       end
+
+       # create the index file
+       def index(write_log=true)
+         # define the logger to use
+         l = write_log ? self.logger : BlackStack::DummyLogger.new
+         # output file extension
+         ext = ".#{self.name}"
+         # index each input file matching the glob pattern
+         Dir.glob(input).each do |file|
+           # get the name of the file from the full path
+           name = file.split('/').last
+           # get the path of the file from the full path
+           path = file.gsub("/#{name}", '')
+           # opening log line
+           l.logs "Indexing #{name}... "
+           # get the output filename
+           output_filename = "#{File.expand_path(self.output)}/#{name.gsub(/\.csv$/, ext)}"
+           # if output file exists, skip
+           if File.exist?(output_filename)
+             l.logf "skip"
+           else
+             # open the input file
+             input_file = File.open(file, 'r')
+             # build the in-memory index for this file
+             i = 0
+             a = []
+             # iterate the lines of input_file
+             input_file.each_line do |line|
+               i += 1
+               fields = []
+               key = []
+               # get the array of fields
+               row = CSV.parse_line(line)
+               # build the key
+               self.keys.each do |k|
+                 colnum = self.mapping[k]
+                 # replace '"' with an empty string, and '|' with ','
+                 key << row[colnum].gsub('"', '').gsub('|', ',')
+               end
+               key = "\"#{key.join('|')}\""
+               # add the key as the first field of the index line
+               fields << key
+               # add the row number as the second field of the index line
+               fields << "\"#{i.to_s}\""
+               # iterate the mapping
+               self.mapping.each do |k, v|
+                 # get the data from the row and
+                 # format the field values for the CSV
+                 fields << "\"#{row[v].gsub('"', '')}\""
+               end
+               # add fields to the array
+               a << fields
+             end
+             # sort the array
+             a.sort!
+             # get the output file
+             output_file = File.open(output_filename, 'w')
+             size = nil
+             new_size = nil
+             # write the array to the output file
+             a.each do |row|
+               # join the fields into a CSV line
+               line = row.join(',')
+               # add the size of the line as a last field of the row.
+               # this value is necessary to run the binary search.
+               size = line.size
+               new_size = size + 1 + 2 + size.to_s.size # 1 comma, 2 double-quotes, and the size of the size itself
+               new_size += 1 if size.to_s.size < new_size.to_s.size # add 1 if new_size has one more digit than size (e.g. 104 vs 99)
+               size = new_size
+               line += ",\"#{size.to_s}\""
+               output_file.puts line
+             end
+             # close the output file
+             output_file.close
+             # close log
+             l.done
+           end
+         end
+       end # def index
+
+       # compare 2 keys.
+       # if exact_match is false and every value of key1 is contained in the corresponding value of key2, return 0.
+       # otherwise, return 0 if the keys are equal, 1 if key1 sorts before key2, and -1 if key1 sorts after key2.
+       # this method is used by the binary search.
+       # this method should not be used by the user.
+       #
+       # Example:
+       #   compare_keys(['Century 21'], ['Century 21 LLC'], false)
+       #   => 0
+       #
+       # Example:
+       #   compare_keys(['Century 21'], ['Century 21 LLC'], true)
+       #   => 1
+       #
+       def compare_keys(key1, key2, exact_match=true)
+         match = true
+         # get the keys as arrays
+         a1 = key1 #.split('|')
+         a2 = key2 #.split('|')
+         # validation: a2 must have at least as many elements as a1
+         raise 'The key2 must have at least as many elements as key1.' if a2.size < a1.size
+         # iterate the arrays
+         a2.each_with_index do |k, i|
+           match = false if k !~ /#{Regexp.escape(a1[i].to_s)}/i
+         end
+         return 0 if match && !exact_match
+         # return the result
+         # iterate the arrays
+         a1.each_with_index do |k, i|
+           # if the keys are different, return the result
+           if k.upcase < a2[i].upcase
+             return 1
+           elsif k.upcase > a2[i].upcase
+             return -1
+           end
+         end
+         # if the keys are equal, return 0
+         return 0
+       end
+
+       # search the index.
+       # return a hash with the matches and a brief performance report.
+       def find(key, exact_match=true, write_log=false)
+         # if key is a string, convert it into an array of 1 element
+         key = [key] if key.is_a?(String)
+
+         # build the response.
+         ret = {
+           :matches => [],
+         }
+
+         # define the logger to use
+         l = write_log ? self.logger : BlackStack::DummyLogger.new
+
+         # define the source
+         source = "#{File.expand_path(self.output)}/*.#{self.name}"
+
+         # start time
+         start_time = Time.now
+
+         # totals
+         total_matches = 0
+
+         # searching in the indexed files
+         l.log "Search term: #{key.to_s}"
+         files = Dir.glob(source)
+         n = 0
+         files.each do |file|
+           # get the name of the file from the full path
+           name = file.split('/').last
+           # get the path of the file from the full path
+           path = file.gsub("/#{name}", '')
+           # opening log line
+           l.logs "Searching into #{name}... "
+           # setting boundaries for the binary search
+           i = 0
+           max = `wc -c #{file}`.split(' ').first.to_i
+           middle = ((i + max) / 2).to_i
+           # totals
+           # open file with random access
+           f = File.open(file, 'r')
+           # remember the middle value from the previous iteration
+           prev = -1
+           # binary search
+           while i < max
+             # get the middle of the file
+             middle = ((i + max) / 2).to_i
+             # break if the middle is the same as the previous iteration
+             break if middle == prev
+             # remember the middle in this iteration
+             prev = middle
+             # opening log line
+             l.logs "#{middle}... "
+             # go to the middle of the file
+             f.seek(middle)
+             # read from the current position to the end of the line.
+             # the cursor most likely landed in the middle of a line,
+             # so this read returns only the tail of that line.
+             line = f.readline
+             # the last field of every row stores the row size, so read it to re-align to the start of the line.
+             a = line.split('","')
+             while a.size < 2 # handles the case where the cursor landed inside the last field, which stores the size of the line
+               middle -= 1
+               f.seek(middle)
+               line = f.readline
+               a = line.split('","')
+             end
+             line_size = a.last.gsub('"', '').to_i
+             middle -= line_size - line.size + 1
+             # seek and readline again, to get the line from its beginning
+             f.seek(middle)
+             line = f.readline
+             # strip the line
+             line.strip!
+             # get the fields of the CSV line
+             fields = CSV.parse_line(line)
+             row_key = fields[0].split('|')
+             # compare keys
+             x = compare_keys(key, row_key, exact_match)
+             # compare the first field with the search term
+             if x == 0
+               # found
+               l.logf "found (#{row_key})"
+               ret[:matches] << fields.dup
+               total_matches += 1
+               break
+             else
+               # not found
+               if x == 1
+                 # the key sorts before this row, so search the lower half
+                 max = middle
+               else # x == -1
+                 # the key sorts after this row, so search the upper half
+                 i = middle + line.size + 1
+               end
+               l.logf "not found (#{row_key})"
+             end
+           end
+           # closing the file
+           f.close
+           # closing the log line
+           l.done
+           # increment file counter
+           n += 1
+         end
+
+         end_time = Time.now
+
+         ret[:enlapsed_seconds] = end_time - start_time
+         ret[:lines_matched] = total_matches
+
+         l.log "Matches: #{total_matches.to_s}"
+         l.log "Elapsed seconds: #{ret[:enlapsed_seconds].to_s}"
+
+         ret
+       end # def find
+     end
+   end # module CSVIndexer
+ end # module BlackStack
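
For reviewers who want to see how the API added above is meant to be driven, here is a minimal usage sketch based only on the methods defined in lib/csv-indexer.rb (add_indexation, index, find). The file paths, directories, column numbers, and field names are illustrative assumptions, not part of the gem; the project README at https://github.com/leandrosardi/csv-indexer is the authoritative reference.

    require 'csv-indexer'

    # Describe an index. Column numbers are 0-based positions in the CSV rows
    # (see `colnum = self.mapping[k]` and `row[colnum]` above); the names and
    # paths used here are hypothetical and the directories must already exist.
    BlackStack::CSVIndexer.add_indexation({
      :name => 'companies',          # index name; also the extension of the generated index files
      :input => './data/*.csv',      # glob pattern of the CSV files to index
      :output => './indexes',        # directory where the .companies index files are written
      :log => './logs',              # directory where companies.log is written
      :mapping => {                  # field name => column number in the CSV
        :company_name => 0,
        :city => 1,
      },
      :keys => [:company_name],      # mapping fields concatenated into the search key
    })

    # Build the sorted index files, then run a binary search over them.
    BlackStack::CSVIndexer.index('companies')
    results = BlackStack::CSVIndexer.find('companies', 'Century 21', false) # exact_match = false
    puts results[:lines_matched]
    puts results[:enlapsed_seconds]

Each generated index row has the shape "key","row_number","field",...,"row_size": the rows are sorted by key, and the trailing size field is what find uses to re-align to the start of a line after each random seek.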
metadata ADDED
@@ -0,0 +1,86 @@
+ --- !ruby/object:Gem::Specification
+ name: csv-indexer
+ version: !ruby/object:Gem::Version
+   version: 1.0.1
+ platform: ruby
+ authors:
+ - Leandro Daniel Sardi
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2022-11-08 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: csv
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 3.2.2
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 3.2.2
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 3.2.2
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 3.2.2
+ - !ruby/object:Gem::Dependency
+   name: simple_cloud_logging
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.2.2
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.2.2
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.2.2
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.2.2
+ description: 'CSV Indexer makes it simple to index and search large CSV files. It
+   is not as robust as Lucene, but it is simple and cost-effective. It can index files
+   with millions of rows and find specific rows in a matter of seconds. Find documentation
+   here: https://github.com/leandrosardi/csv-indexer.'
+ email: leandro.sardi@expandedventure.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/csv-indexer.rb
+ homepage: https://github.com/leandrosardi/csv-indexer
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.3.7
+ signing_key:
+ specification_version: 4
+ summary: CSV Indexer makes it simple to index and search large CSV files.
+ test_files: []
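
As a closing note, a minimal Gemfile entry matching the gem name and version declared in this metadata (the source line assumes the default rubygems.org source):

    # Gemfile
    source 'https://rubygems.org'

    gem 'csv-indexer', '1.0.1'

After `bundle install`, `require 'csv-indexer'` loads lib/csv-indexer.rb, the only file shipped in this release.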