csv-indexer 1.0.1

Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/csv-indexer.rb +333 -0
  3. metadata +86 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 2a91e291c094278a60e60a898b31063afab3910ec6a01b81fcdd0451f61acdda
+   data.tar.gz: f80471808e5f317f6e1040a7c858f48d9e6f853fc5cb2263ab168b216ee17a2d
+ SHA512:
+   metadata.gz: efd9de2b0a242963542466b53045b07120722119e16c52de77b9c016058b2c684e99b779cd2d54ea49c1c2c10dfcf2fb95e0314d4504f1c7f1bb9dca7455cab8
+   data.tar.gz: '053588f0ae2c3e598c5039f8be10e9cededf4076393585c3a94235774ed11302bc9293b0b0ed58a524d8ecda581a8bbcf7226f689a53defc734ae07c01714e47'
data/lib/csv-indexer.rb ADDED
@@ -0,0 +1,333 @@
+ require 'csv'
+ require 'simple_cloud_logging'
+
+ module BlackStack
+   module CSVIndexer
+     @indexes = []
+
+     def self.indexes
+       @indexes
+     end
+
+     def self.add_indexation(h)
+       @indexes << BlackStack::CSVIndexer::Index.new(h)
+     end
+
+     def self.index(name, write_log=true)
+       i = @indexes.select { |idx| idx.name == name }.first
+       raise 'Index not found.' if i.nil?
+       i.index(write_log)
+     end
+
+     def self.find(name, key, exact_match=true, write_log=false)
+       i = @indexes.select { |idx| idx.name == name }.first
+       raise 'Index not found.' if i.nil?
+       i.find(key, exact_match, write_log)
+     end
+
+     # define Index class
+     class Index
+       attr_accessor :name, :description, :input, :output, :log, :mapping, :keys, :logger
+
+       def initialize(h)
+         errors = []
+
+         # validate: h is a hash
+         raise "The parameter must be a hash." unless h.is_a?(Hash)
+
+         # validate: :name is present
+         errors << "The parameter :name is mandatory." unless h.has_key?(:name)
+
+         # validate: :name is a string
+         errors << "The parameter :name must be a string." unless h[:name].is_a?(String)
+
+         # validate: if :description is present, it is a string
+         errors << "The parameter :description must be a string." if h.has_key?(:description) && !h[:description].is_a?(String)
+
+         # validate: if :input is present, it is a string
+         errors << "The parameter :input must be a string." if h.has_key?(:input) && !h[:input].is_a?(String)
+
+         # validate: if :output is present, it is a string
+         errors << "The parameter :output must be a string." if h.has_key?(:output) && !h[:output].is_a?(String)
+
+         # validate: if :log is present, it is a string
+         errors << "The parameter :log must be a string." if h.has_key?(:log) && !h[:log].is_a?(String)
+
+         # validate: :mapping is present
+         errors << "The parameter :mapping is mandatory." unless h.has_key?(:mapping)
+
+         # validate: :mapping is a hash
+         errors << "The parameter :mapping must be a hash." unless h[:mapping].is_a?(Hash)
+
+         # validate: :keys is present
+         errors << "The parameter :keys is mandatory." unless h.has_key?(:keys)
+
+         # validate: :keys is an array
+         errors << "The parameter :keys must be an array." unless h[:keys].is_a?(Array)
+
+         # validate: :name is unique
+         errors << "The parameter :name must be unique." if BlackStack::CSVIndexer.indexes.map{|i| i.name}.include?(h[:name])
+
+         # if errors happened, raise an exception
+         raise "The following errors happened while creating the index: #{errors.join(', ')}" unless errors.empty?
+
+         # default value for :input
+         h[:input] = './*.csv' unless h.has_key?(:input)
+
+         # default value for :output
+         h[:output] = './' unless h.has_key?(:output)
+
+         # default value for :log
+         h[:log] = './' unless h.has_key?(:log)
+
+         # create the logger
+         self.logger = BlackStack::LocalLogger.new("#{h[:log]}/#{h[:name]}.log")
+
+         # set the attributes
+         self.name = h[:name]
+         self.description = h[:description]
+         self.input = h[:input]
+         self.output = h[:output]
+         self.log = h[:log]
+         self.mapping = h[:mapping]
+         self.keys = h[:keys]
+       end
+
+       # create the index file
+       def index(write_log=true)
+         # define the logger to use
+         l = write_log ? self.logger : BlackStack::DummyLogger.new
+         # output file extension
+         ext = ".#{self.name}"
+         # index each input file matching the glob pattern
+         Dir.glob(input).each do |file|
+           # get the name of the file from the full path
+           name = file.split('/').last
+           # get the path of the file from the full path
+           path = file.gsub("/#{name}", '')
+           # opening log line
+           l.logs "Indexing #{name}... "
+           # get the output filename
+           output_filename = "#{File.expand_path(self.output)}/#{name.gsub(/\.csv$/, ext)}"
+           # if output file exists, skip
+           if File.exist?(output_filename)
+             l.logf "skip"
+           else
+             # open the input file
+             input_file = File.open(file, 'r')
+             # build the in-memory index for this file
+             i = 0
+             a = []
+             # iterate the lines of input_file
+             input_file.each_line do |line|
+               i += 1
+               fields = []
+               key = []
+               # get the array of fields
+               row = CSV.parse_line(line)
+               # build the key
+               self.keys.each do |k|
+                 colnum = self.mapping[k]
+                 # replace '"' with an empty string, and '|' with ','
+                 key << row[colnum].gsub('"', '').gsub('|', ',')
+               end
+               key = "\"#{key.join('|')}\""
+               # add the key as the first field of the index line
+               fields << key
+               # add the row number as the second field of the index line
+               fields << "\"#{i.to_s}\""
+               # iterate the mapping
+               self.mapping.each do |k, v|
+                 # get the data from the row and
+                 # format the field values for the CSV
+                 fields << "\"#{row[v].gsub('"', '')}\""
+               end
+               # add fields to the array
+               a << fields
+             end
+             # sort the array
+             a.sort!
+             # get the output file
+             output_file = File.open(output_filename, 'w')
+             size = nil
+             new_size = nil
+             # write the array to the output file
+             a.each do |row|
+               # join the fields into a CSV line
+               line = row.join(',')
+               # add the size of the line as a last field of the row.
+               # this value is necessary to run the binary search.
+               size = line.size
+               new_size = size + 1 + 2 + size.to_s.size # 1 comma, 2 double-quotes, and the size of the size itself
+               new_size += 1 if size.to_s.size < new_size.to_s.size # add 1 if new_size has one more digit than size (e.g. 104 vs 99)
+               size = new_size
+               line += ",\"#{size.to_s}\""
+               output_file.puts line
+             end
+             # close the output file
+             output_file.close
+             # close log
+             l.done
+           end
+         end
+       end # def index
+
+       # compare 2 keys.
+       # if exact_match is false and every value of key1 is contained in the corresponding value of key2, return 0.
+       # otherwise, return 0 if the keys are equal, 1 if key1 sorts before key2, and -1 if key1 sorts after key2.
+       # this method is used by the binary search.
+       # this method should not be used by the user.
+       #
+       # Example:
+       #   compare_keys(['Century 21'], ['Century 21 LLC'], false)
+       #   => 0
+       #
+       # Example:
+       #   compare_keys(['Century 21'], ['Century 21 LLC'], true)
+       #   => 1
+       #
+       def compare_keys(key1, key2, exact_match=true)
+         match = true
+         # get the keys as arrays
+         a1 = key1 #.split('|')
+         a2 = key2 #.split('|')
+         # validation: a2 must have at least as many elements as a1
+         raise 'The key2 must have at least as many elements as key1.' if a2.size < a1.size
+         # iterate the arrays
+         a2.each_with_index do |k, i|
+           match = false if k !~ /#{Regexp.escape(a1[i].to_s)}/i
+         end
+         return 0 if match && !exact_match
+         # return the result
+         # iterate the arrays
+         a1.each_with_index do |k, i|
+           # if the keys are different, return the result
+           if k.upcase < a2[i].upcase
+             return 1
+           elsif k.upcase > a2[i].upcase
+             return -1
+           end
+         end
+         # if the keys are equal, return 0
+         return 0
+       end
+
+       # search the index.
+       # return a hash with the matches and a brief performance report.
+       def find(key, exact_match=true, write_log=false)
+         # if key is a string, convert it into an array of 1 element
+         key = [key] if key.is_a?(String)
+
+         # build the response.
+         ret = {
+           :matches => [],
+         }
+
+         # define the logger to use
+         l = write_log ? self.logger : BlackStack::DummyLogger.new
+
+         # define the source
+         source = "#{File.expand_path(self.output)}/*.#{self.name}"
+
+         # start time
+         start_time = Time.now
+
+         # totals
+         total_matches = 0
+
+         # searching in the indexed files
+         l.log "Search term: #{key.to_s}"
+         files = Dir.glob(source)
+         n = 0
+         files.each do |file|
+           # get the name of the file from the full path
+           name = file.split('/').last
+           # get the path of the file from the full path
+           path = file.gsub("/#{name}", '')
+           # opening log line
+           l.logs "Searching into #{name}... "
+           # setting boundaries for the binary search
+           i = 0
+           max = `wc -c #{file}`.split(' ').first.to_i
+           middle = ((i + max) / 2).to_i
+           # totals
+           # open file with random access
+           f = File.open(file, 'r')
+           # remember the middle value from the previous iteration
+           prev = -1
+           # binary search
+           while i < max
+             # get the middle of the file
+             middle = ((i + max) / 2).to_i
+             # break if the middle is the same as the previous iteration
+             break if middle == prev
+             # remember the middle in this iteration
+             prev = middle
+             # opening log line
+             l.logs "#{middle}... "
+             # go to the middle of the file
+             f.seek(middle)
+             # read from the current position to the end of the line.
+             # the cursor most likely landed in the middle of a line,
+             # so this read returns only the tail of that line.
+             line = f.readline
+             # the last field of every row stores the row size, so read it to re-align to the start of the line.
+             a = line.split('","')
+             while a.size < 2 # handles the case where the cursor landed inside the last field, which stores the size of the line
+               middle -= 1
+               f.seek(middle)
+               line = f.readline
+               a = line.split('","')
+             end
+             line_size = a.last.gsub('"', '').to_i
+             middle -= line_size - line.size + 1
+             # seek and readline again, to get the line from its beginning
+             f.seek(middle)
+             line = f.readline
+             # strip the line
+             line.strip!
+             # get the fields of the CSV line
+             fields = CSV.parse_line(line)
+             row_key = fields[0].split('|')
+             # compare keys
+             x = compare_keys(key, row_key, exact_match)
+             # compare the first field with the search term
+             if x == 0
+               # found
+               l.logf "found (#{row_key})"
+               ret[:matches] << fields.dup
+               total_matches += 1
+               break
+             else
+               # not found
+               if x == 1
+                 # the key sorts before this row, so search the lower half
+                 max = middle
+               else # x == -1
+                 # the key sorts after this row, so search the upper half
+                 i = middle + line.size + 1
+               end
+               l.logf "not found (#{row_key})"
+             end
+           end
+           # closing the file
+           f.close
+           # closing the log line
+           l.done
+           # increment file counter
+           n += 1
+         end
+
+         end_time = Time.now
+
+         ret[:enlapsed_seconds] = end_time - start_time
+         ret[:lines_matched] = total_matches
+
+         l.log "Matches: #{total_matches.to_s}"
+         l.log "Elapsed seconds: #{ret[:enlapsed_seconds].to_s}"
+
+         ret
+       end # def find
+     end
+   end # module CSVIndexer
+ end # module BlackStack
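
For reviewers who want to see how the API added above is meant to be driven, here is a minimal usage sketch based only on the methods defined in lib/csv-indexer.rb (add_indexation, index, find). The file paths, directories, column numbers, and field names are illustrative assumptions, not part of the gem; the project README at https://github.com/leandrosardi/csv-indexer is the authoritative reference.

    require 'csv-indexer'

    # Describe an index. Column numbers are 0-based positions in the CSV rows
    # (see `colnum = self.mapping[k]` and `row[colnum]` above); the names and
    # paths used here are hypothetical and the directories must already exist.
    BlackStack::CSVIndexer.add_indexation({
      :name => 'companies',          # index name; also the extension of the generated index files
      :input => './data/*.csv',      # glob pattern of the CSV files to index
      :output => './indexes',        # directory where the .companies index files are written
      :log => './logs',              # directory where companies.log is written
      :mapping => {                  # field name => column number in the CSV
        :company_name => 0,
        :city => 1,
      },
      :keys => [:company_name],      # mapping fields concatenated into the search key
    })

    # Build the sorted index files, then run a binary search over them.
    BlackStack::CSVIndexer.index('companies')
    results = BlackStack::CSVIndexer.find('companies', 'Century 21', false) # exact_match = false
    puts results[:lines_matched]
    puts results[:enlapsed_seconds]

Each generated index row has the shape "key","row_number","field",...,"row_size": the rows are sorted by key, and the trailing size field is what find uses to re-align to the start of a line after each random seek.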
metadata ADDED
@@ -0,0 +1,86 @@
+ --- !ruby/object:Gem::Specification
+ name: csv-indexer
+ version: !ruby/object:Gem::Version
+   version: 1.0.1
+ platform: ruby
+ authors:
+ - Leandro Daniel Sardi
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2022-11-08 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: csv
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 3.2.2
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 3.2.2
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 3.2.2
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 3.2.2
+ - !ruby/object:Gem::Dependency
+   name: simple_cloud_logging
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.2.2
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.2.2
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 1.2.2
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.2.2
+ description: 'CSV Indexer makes it simple to index and search large CSV files. It
+   is not as robust as Lucene, but it is simple and cost-effective. It can index files
+   with millions of rows and find specific rows in a matter of seconds. Find documentation
+   here: https://github.com/leandrosardi/csv-indexer.'
+ email: leandro.sardi@expandedventure.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/csv-indexer.rb
+ homepage: https://github.com/leandrosardi/csv-indexer
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubygems_version: 3.3.7
+ signing_key:
+ specification_version: 4
+ summary: CSV Indexer makes it simple to index and search large CSV files.
+ test_files: []
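
As a closing note, a minimal Gemfile entry matching the gem name and version declared in this metadata (the source line assumes the default rubygems.org source):

    # Gemfile
    source 'https://rubygems.org'

    gem 'csv-indexer', '1.0.1'

After `bundle install`, `require 'csv-indexer'` loads lib/csv-indexer.rb, the only file shipped in this release.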