csv-indexer 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/csv-indexer.rb +333 -0
  3. metadata +86 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 2a91e291c094278a60e60a898b31063afab3910ec6a01b81fcdd0451f61acdda
4
+ data.tar.gz: f80471808e5f317f6e1040a7c858f48d9e6f853fc5cb2263ab168b216ee17a2d
5
+ SHA512:
6
+ metadata.gz: efd9de2b0a242963542466b53045b07120722119e16c52de77b9c016058b2c684e99b779cd2d54ea49c1c2c10dfcf2fb95e0314d4504f1c7f1bb9dca7455cab8
7
+ data.tar.gz: '053588f0ae2c3e598c5039f8be10e9cededf4076393585c3a94235774ed11302bc9293b0b0ed58a524d8ecda581a8bbcf7226f689a53defc734ae07c01714e47'
@@ -0,0 +1,333 @@
1
+ require 'csv'
2
+ require 'simple_cloud_logging'
3
+
4
+ module BlackStack
5
+ module CSVIndexer
6
+ @indexes = []
7
+
8
# Accessor for the module-level registry of indexes.
# Returns the Array stored in @indexes itself (not a copy), so callers see
# every index appended later by add_indexation.
def self.indexes
  @indexes
end
11
+
12
# Registers a new index built from the descriptor hash `h`.
# The hash is handed to Index.new, which validates it and raises when it is
# malformed. Returns the registry Array after the new index was appended.
def self.add_indexation(h)
  new_index = BlackStack::CSVIndexer::Index.new(h)
  @indexes.push(new_index)
end
15
+
16
# Builds the index files of the registered index called `name`.
#
# name      - String name the index was registered with.
# write_log - forwarded to Index#index (true by default).
#
# Raises RuntimeError ('Index not found.') when no registered index has that name.
# Returns whatever Index#index returns.
def self.index(name, write_log=true)
  # Compare with `==`. The previous `i.name = name` was an assignment: it
  # renamed every registered index and always selected the first one.
  target = @indexes.find { |idx| idx.name == name }
  raise 'Index not found.' if target.nil?
  target.index(write_log)
end
21
+
22
# Searches the registered index called `name`.
#
# name        - String name the index was registered with.
# key         - search key, forwarded to Index#find.
# exact_match - forwarded to Index#find (true by default).
# write_log   - forwarded to Index#find (false by default).
#
# Raises RuntimeError ('Index not found.') when no registered index has that name.
# Returns whatever Index#find returns.
def self.find(name, key, exact_match=true, write_log=false)
  # Compare with `==`. The previous `i.name = name` was an assignment: it
  # renamed every registered index and always selected the first one.
  target = @indexes.find { |idx| idx.name == name }
  raise 'Index not found.' if target.nil?
  target.find(key, exact_match, write_log)
end
27
+
28
+ # define Index class
29
+ class Index
30
+ attr_accessor :name, :description, :input, :output, :log, :mapping, :keys, :logger
31
+
32
# Validates the descriptor hash and initializes the index.
#
# h - Hash describing the index:
#     :name        - String, mandatory, must not be used by an index that is
#                    already registered in BlackStack::CSVIndexer.
#     :description - String, optional.
#     :input       - String, optional, defaults to './*.csv'.
#     :output      - String, optional, defaults to './'.
#     :log         - String, optional, defaults to './'.
#     :mapping     - Hash, mandatory.
#     :keys        - Array, mandatory.
#
# Raises RuntimeError when `h` is not a Hash, or with the list of all the
# validation errors found. The caller's hash is never modified.
def initialize(h)
  # nothing else can be validated if the descriptor is not a hash
  raise "The parameter must be a hash." unless h.is_a?(Hash)

  # collect every problem so the caller gets the full list in one exception
  errors = []
  errors << "The parameter :name is mandatory." unless h.has_key?(:name)
  errors << "The parameter :name must be a string." unless h[:name].is_a?(String)
  errors << "The parameter :description must be a string." if h.has_key?(:description) && !h[:description].is_a?(String)
  errors << "The parameter :input must be a string." if h.has_key?(:input) && !h[:input].is_a?(String)
  errors << "The parameter :output must be a string." if h.has_key?(:output) && !h[:output].is_a?(String)
  errors << "The parameter :log must be a string." if h.has_key?(:log) && !h[:log].is_a?(String)
  errors << "The parameter :mapping is mandatory." unless h.has_key?(:mapping)
  errors << "The parameter :mapping must be a hash." unless h[:mapping].is_a?(Hash)
  errors << "The parameter :keys is mandatory." unless h.has_key?(:keys)
  errors << "The parameter :keys must be an array." unless h[:keys].is_a?(Array)
  errors << "The parameter :name must be unique." if BlackStack::CSVIndexer.indexes.map { |i| i.name }.include?(h[:name])
  raise "The following errors happened while creating the index: #{errors.join(', ')}" unless errors.empty?

  # resolve the defaults into locals instead of writing them back into `h`,
  # so the hash passed by the caller is left untouched
  input_glob = h.has_key?(:input) ? h[:input] : './*.csv'
  output_dir = h.has_key?(:output) ? h[:output] : './'
  log_dir = h.has_key?(:log) ? h[:log] : './'

  # one log file per index, named after the index
  self.logger = BlackStack::LocalLogger.new("#{log_dir}/#{h[:name]}.log")

  self.name = h[:name]
  self.description = h[:description]
  self.input = input_glob
  self.output = output_dir
  self.log = log_dir
  self.mapping = h[:mapping]
  self.keys = h[:keys]
end
95
+
96
# Creates one index file for every file matched by the `input` glob.
#
# The index of `foo.csv` is written to `<output>/foo.<index name>`. Input
# files whose index file already exists are skipped. Every index line holds,
# double-quoted and comma-separated: the key (the key columns joined by '|'),
# the 1-based line number in the input file, one value per entry of `mapping`
# (mapping values are column numbers) and, last, the length of the index line
# itself (without the line break), which #find uses to locate the beginning
# of a line after seeking to an arbitrary offset.
#
# Lines are ordered by their upcased key parts, compared part by part, which
# is the ordering compare_keys applies during the binary search. Sorting the
# quoted, case-sensitive text (as done before) placed e.g. "Century 21" after
# "Century 21 LLC" and made exact lookups miss existing rows.
#
# write_log - when true (default) progress goes to the index logger,
#             otherwise to a BlackStack::DummyLogger.
#
# Returns the Array of input files matched by the glob.
def index(write_log=true)
  # define the logger to use
  l = write_log ? self.logger : BlackStack::DummyLogger.new
  # output file extension
  ext = ".#{self.name}"
  Dir.glob(input).each do |file|
    # file name without its directory
    name = File.basename(file)
    # opening log line
    l.logs "Indexing #{name}... "
    output_filename = "#{File.expand_path(self.output)}/#{name.gsub(/\.csv$/, ext)}"
    # File.exist? instead of File.exists?, which was removed in Ruby 3.2
    if File.exist?(output_filename)
      l.logf "skip"
      next
    end
    entries = []
    # File.foreach closes the input file when the iteration finishes
    File.foreach(file).with_index(1) do |line, i|
      row = CSV.parse_line(line)
      # blank lines carry no data; the line counter still advances
      next if row.nil? || row.empty?
      # key parts: drop '"' and turn '|' into ',' because '|' is the separator
      # of the key parts; empty cells are parsed as nil, hence the to_s
      key_parts = self.keys.map do |k|
        row[self.mapping[k]].to_s.gsub('"', '').gsub('|', ',')
      end
      # key first, then the line number, then the mapped columns
      fields = ["\"#{key_parts.join('|')}\"", "\"#{i}\""]
      self.mapping.each_value do |colnum|
        fields << "\"#{row[colnum].to_s.gsub('"', '')}\""
      end
      entries << [key_parts.map { |part| part.upcase }, i, fields]
    end
    # same ordering as the binary search; the line number breaks ties
    entries.sort_by! { |sort_key, i, _fields| [sort_key, i] }
    File.open(output_filename, 'w') do |output_file|
      entries.each do |_sort_key, _i, fields|
        line = fields.join(',')
        # the last field is the size of the whole line including that field:
        # 1 comma, 2 double-quotes, and the digits of the size itself
        size = line.size
        new_size = size + 1 + 2 + size.to_s.size
        # one more if the total has one more digit than the partial size (e.g. 104 vs 99)
        new_size += 1 if size.to_s.size < new_size.to_s.size
        output_file.puts "#{line},\"#{new_size}\""
      end
    end
    # close log
    l.done
  end
end # def index
174
+
175
# Compares a search key against the key of an index row.
# This method is used by the binary search in #find; it is not meant to be
# called by the user.
#
# key1        - Array of Strings with the parts of the search key.
# key2        - Array of Strings with the parts of the row key; it cannot have
#               fewer elements than key1.
# exact_match - when false, the keys are also considered a match if every
#               part of key2 contains the corresponding part of key1
#               (case-insensitive); parts of key2 beyond the size of key1
#               always pass that check.
#
# Returns 0 when the keys match, 1 when key1 sorts BEFORE key2 and -1 when
# key1 sorts AFTER key2, comparing the upcased parts one by one. Note that
# the sign is the opposite of `<=>`: #find searches the lower half of the
# file on 1 and the upper half on -1.
#
# Raises RuntimeError when key2 has fewer elements than key1.
#
# Example:
#   compare_keys(['Century 21'], ['Century 21 LLC'], false)
#   => 0
#
# Example:
#   compare_keys(['Century 21'], ['Century 21 LLC'], true)
#   => 1
#
def compare_keys(key1, key2, exact_match=true)
  raise 'The key2 must has more elements than key1.' if key2.size < key1.size
  # the substring check is only relevant for partial matching
  unless exact_match
    contained = key2.each_with_index.all? do |part, i|
      part =~ /#{Regexp.escape(key1[i].to_s)}/i
    end
    return 0 if contained
  end
  # the first differing part decides the order
  key1.each_with_index do |part, i|
    mine = part.upcase
    theirs = key2[i].upcase
    return 1 if mine < theirs
    return -1 if mine > theirs
  end
  # all the parts of key1 are equal to the ones of key2
  0
end
214
+
215
# Searches every index file of this index (`<output>/*.<name>`) for `key`,
# running a binary search over the byte offsets of each file.
#
# The search stops at the first matching line of each file, so at most one
# match per index file is reported.
#
# key         - String, or Array of Strings with the parts of the key.
# exact_match - forwarded to compare_keys (true by default).
# write_log   - when true progress goes to the index logger, otherwise
#               (default) to a BlackStack::DummyLogger.
#
# Returns a Hash with:
#   :matches          - Array with the parsed fields of every matching line.
#   :enlapsed_seconds - seconds spent in the search.
#   :lines_matched    - number of matches.
def find(key, exact_match=true, write_log=false)
  # if key is a string, convert it into an array of 1 element
  key = [key] if key.is_a?(String)

  # build the response.
  ret = {
    :matches => [],
  }

  # define the logger to use
  l = write_log ? self.logger : BlackStack::DummyLogger.new

  # define the source
  source = "#{File.expand_path(self.output)}/*.#{self.name}"

  start_time = Time.now
  total_matches = 0

  # searching in the indexed files
  l.log "Search term: #{key.to_s}"
  Dir.glob(source).each do |file|
    # file name without its directory
    name = File.basename(file)
    # opening log line
    l.logs "Searching into #{name}... "
    # boundaries of the binary search, as byte offsets.
    # File.size replaces the former `wc -c` shell call, which failed on paths
    # with spaces or shell metacharacters and on systems without `wc`.
    i = 0
    max = File.size(file)
    # remember the middle of the previous iteration
    prev = -1
    # the block form closes the file even if seek/readline raises
    File.open(file, 'r') do |f|
      while i < max
        # get the middle of the remaining range
        middle = ((i + max) / 2).to_i
        # break if the middle is the same as in the previous iteration
        break if middle == prev
        prev = middle
        # opening log line
        l.logs "#{middle}... "
        # most probably the cursor lands in the middle of a line, so what is
        # read here is only the tail of that line
        f.seek(middle)
        line = f.readline
        a = line.split('","')
        # the cursor landed inside the last field (the size of the line):
        # step back until the tail contains at least one field separator
        while a.size < 2
          middle -= 1
          f.seek(middle)
          line = f.readline
          a = line.split('","')
        end
        # the last field is the size of the whole line (without line break);
        # `line` is the tail including the line break, so this moves `middle`
        # to the first byte of the line
        line_size = a.last.gsub('"', '').to_i
        middle -= line_size - line.size + 1
        # seek and readline again, to get the line from its beginning
        f.seek(middle)
        line = f.readline
        line.strip!
        # the first field of the CSV line is the key
        fields = CSV.parse_line(line)
        row_key = fields[0].split('|')
        x = compare_keys(key, row_key, exact_match)
        if x == 0
          # found
          l.logf "found (#{row_key})"
          ret[:matches] << fields.dup
          total_matches += 1
          break
        else
          if x == 1
            # the key sorts before this line: search the lower half
            max = middle
          else
            # the key sorts after this line: search from the next line on
            i = middle + line.size + 1
          end
          l.logf "not found (#{row_key})"
        end
      end
    end
    # closing the log line
    l.done
  end

  end_time = Time.now

  ret[:enlapsed_seconds] = end_time - start_time
  ret[:lines_matched] = total_matches

  l.log "Matches: #{total_matches.to_s}"
  l.log "Enlapsed seconds: #{ret[:enlapsed_seconds].to_s}"

  ret
end # def find
331
+ end
332
+ end # module CSVIndexer
333
+ end # module BlackStack
metadata ADDED
@@ -0,0 +1,86 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: csv-indexer
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Leandro Daniel Sardi
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2022-11-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: csv
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 3.2.2
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 3.2.2
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: 3.2.2
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 3.2.2
33
+ - !ruby/object:Gem::Dependency
34
+ name: simple_cloud_logging
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: 1.2.2
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 1.2.2
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: 1.2.2
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 1.2.2
53
+ description: 'CSV Indexer makes it simple to index and search large CSV
54
+ files. It is not as robust as Lucene, but it is simple and cost-effective. It can
55
+ index files with millions of rows and find specific rows in a matter of seconds. Find
56
+ documentation here: https://github.com/leandrosardi/csv-indexer.'
57
+ email: leandro.sardi@expandedventure.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - lib/csv-indexer.rb
63
+ homepage: https://github.com/leandrosardi/csv-indexer
64
+ licenses:
65
+ - MIT
66
+ metadata: {}
67
+ post_install_message:
68
+ rdoc_options: []
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubygems_version: 3.3.7
83
+ signing_key:
84
+ specification_version: 4
85
+ summary: CSV Indexer makes it simple to index and search large CSV files.
86
+ test_files: []