RubyGems - csv-indexer - Versions diffs - 1.0.1 → 1.0.2 - Mend

csv-indexer 1.0.1 → 1.0.2

Files changed (3) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2a91e291c094278a60e60a898b31063afab3910ec6a01b81fcdd0451f61acdda
-  data.tar.gz: f80471808e5f317f6e1040a7c858f48d9e6f853fc5cb2263ab168b216ee17a2d
+  metadata.gz: 447d47db02c690a8bda0c1418a3373363c52f1e9295e962f006ddf5123f10c42
+  data.tar.gz: 0f0f2a0910b7e5d4128d0f132acc071091cad18166251f873593015a8f9bf1d8
 SHA512:
-  metadata.gz: efd9de2b0a242963542466b53045b07120722119e16c52de77b9c016058b2c684e99b779cd2d54ea49c1c2c10dfcf2fb95e0314d4504f1c7f1bb9dca7455cab8
-  data.tar.gz: '053588f0ae2c3e598c5039f8be10e9cededf4076393585c3a94235774ed11302bc9293b0b0ed58a524d8ecda581a8bbcf7226f689a53defc734ae07c01714e47'
+  metadata.gz: 541ed58833c51c0f89e5bc4d217fb5b28fef3613b97f12711bd710a1d9e8bce425a69b7574f229b88c1bcf2caae2ad7fbb4db76d155c54b214f543d723e18823
+  data.tar.gz: 5b68297cf8b08d43618dc6ef34474ca30988fae542fcdd4cdf8f729c77d416a878f265d37faaf7aa640a91f2b101e517171eb979d141163c272da1c7ea71cfcd

data/lib/csv-indexer.rb CHANGED Viewed

@@ -99,6 +99,7 @@ module BlackStack
                 l = write_log ? self.logger : BlackStack::DummyLogger.new
                 # output file extension
                 ext = ".#{self.name}"
                 # index the bites
                 Dir.glob(input).each do |file|
                     # get the name of the file from the full path
@@ -120,30 +121,34 @@ module BlackStack
                         a = []
                         # iterate lines if input_file
                         input_file.each_line do |line|
-                            i += 1
-                            fields = []
-                            key = []
-                            # get the array of fields
-                            row = CSV.parse_line(line)
-                            # build the key
-                            self.keys.each do |k|
-                                colnum = self.mapping[k]
-                                # replace '"' by empty string, and '|' with ','
-                                key << row[colnum].gsub('"', '').gsub('|', ',')
+                            begin
+                                i += 1
+                                fields = []
+                                key = []
+                                # get the array of fields
+                                row = CSV.parse_line(line)
+                                # build the key
+                                self.keys.each do |k|
+                                    colnum = self.mapping[k]
+                                    # replace '"' by empty string, and '|' with ','
+                                    key << row[colnum].gsub('"', '').gsub('|', ',')
+                                end
+                                key = "\"#{key.join('|')}\""
+                                # add the key as the first field of the index line
+                                fields << key
+                                # add the row number as the second field of the index line
+                                fields << "\"#{i.to_s}\""
+                                # iterate the mapping
+                                self.mapping.each do |k, v|
+                                    # get the data from the row
+                                    # format the field values for the CSV
+                                    fields << "\"#{row[v].to_s.gsub('"', '')}\""
+                                end
+                                # add fields to the array
+                                a << fields
+                            rescue => e
+                                # TODO: decide how to handle a line that raises while being indexed; for now it is silently skipped.
                             end
-                            key = "\"#{key.join('|')}\""
-                            # add the key as the first field of the index line
-                            fields << key
-                            # add the row number as the second field of the index line
-                            fields << "\"#{i.to_s}\""
-                            # iterate the mapping
-                            self.mapping.each do |k, v|
-                                # get the data from the row
-                                # format the field values for the CSV
-                                fields << "\"#{row[v].gsub('"', '')}\""
-                            end
-                            # add fields to the array
-                            a << fields
                         end
                         # sort the array
                         a.sort!
@@ -193,9 +198,11 @@ module BlackStack
                 a2 = key2 #.split('|')
                 # validation: a2.size > a1.size
                 raise 'The key2 must has more elements than key1.' if a2.size < a1.size
+#binding.pry if a2[0].include?('anubhava-mishra-1b668124')
                 # iterate the arrays
                 a2.each_with_index do |k, i|
-                    match = false if k !~ /#{Regexp.escape(a1[i].to_s)}/i
+#binding.pry if k.include?('anubhava-mishra-1b668124')
+                    match = false if k !~ /^#{Regexp.escape(a1[i].to_s)}/i
                 end
                 return 0 if match && !exact_match
                 # return the result
@@ -248,7 +255,7 @@ module BlackStack
                     l.logs "Searching into #{name}... "
                     # setting boundaries for the binary search
                     i = 0
-                    max = `wc -c #{file}`.split(' ').first.to_i
+                    max = `wc -c '#{file}'`.split(' ').first.to_i
                     middle = ((i + max) / 2).to_i
                     # totals
                     # open file with random access
@@ -260,55 +267,74 @@ module BlackStack
                         # get the middle of the file
                         middle = ((i + max) / 2).to_i
                         # break if the middle is the same as the previous iteration
+#binding.pry if middle==prev
                         break if middle==prev
                         # remember the middle in this iteration
                         prev = middle
                         # opening log line
                         l.logs "#{middle}... "
-                        # go to the middle of the file
-                        f.seek(middle)
-                        # read the line
-                        # the cursor is at the middle of a line
-                        # so, I have to read a second line to get a full line
-                        line = f.readline
-                        # most probably I landed in the middle of a line, so I have to get the size of the line where I landed.
-                        a = line.split('","')
-                        while a.size < 2 # this saves the situation when the cursor is inside the last field where I place the size of the line
-                            middle -= 1
+                        # processing the line
+                        line = ''
+                        line_size = 0
+                        begin
+                            # go to the middle of the file
                             f.seek(middle)
-                            line = f.readline
+                            # read the line
+                            # the cursor is at the middle of a line
+                            # so, I have to read a second line to get a full line
+                            line = f.readline.encode('UTF-8', :undef => :replace, :invalid => :replace, :replace => " ")
+#binding.pry if line.include?('anubhav521@gmail.com')
+                            # most probably I landed in the middle of a line, so I have to get the size of the line where I landed.
                             a = line.split('","')
-                        end
-                        line_size = a.last.gsub('"', '').to_i
-                        middle -= line_size-line.size+1
-                        # seek and readline again, to get the line from its beginning
-                        f.seek(middle)
-                        line = f.readline
-                        # strip the line
-                        line.strip!
-                        # get the first field of the CSV line
-                        fields = CSV.parse_line(line)
-                        row_key = fields[0].split('|')
-                        # compare keys
-                        x = compare_keys(key, row_key, exact_match)
-                        # compare the first field with the search term
-                        if x == 0
-                            # found
-                            l.logf "found (#{row_key})"
-                            ret[:matches] << fields.dup
-                            total_matches += 1
-                            break
-                        else
-                            # not found
-                            if x == 1
-                                # search in the down half
-                                max = middle
-                            else #if x == -1
-                                # search in the up half
-                                i = middle + line.size+1
+                            while a.size < 2 # this saves the situation when the cursor is inside the last field where I place the size of the line
+                                middle -= 1
+                                f.seek(middle)
+                                line = f.readline
+                                a = line.split('","')
                             end
-                            l.logf "not found (#{row_key})"
-                        end
+                            line_size = a.last.gsub('"', '').to_i
+                            middle -= line_size-line.size+1
+                            # seek and readline again, to get the line from its beginning
+                            f.seek(middle)
+                            line = f.readline.encode('UTF-8', :undef => :replace, :invalid => :replace, :replace => " ")
+                            # BAD PRACTICE PATCH: sometimes the new value of middle (`middle -= line_size-line.size+1`) doesn't land on the start of the line.
+                            while line[0] != '"'
+                                middle -= 1
+                                f.seek(middle)
+                                line = f.readline.encode('UTF-8', :undef => :replace, :invalid => :replace, :replace => " ")
+                            end
+                            # strip the line
+                            line.strip!
+                            # get the first field of the CSV line
+                            fields = CSV.parse_line(line)
+                            row_key = fields[0].split('|')
+                            # compare keys
+#binding.pry if line.include?('anubhava-mishra-1b668124')
+                            x = compare_keys(key, row_key, exact_match)
+                            # compare the first field with the search term
+                            if x == 0
+                                # found
+                                l.logf "found (#{row_key})"
+                                ret[:matches] << fields.dup
+                                total_matches += 1
+                                break
+                            else
+                                # not found
+                                if x == 1
+                                    # search in the down half
+                                    max = middle
+                                else #if x == -1
+                                    # search in the up half
+                                    i = middle + line.size+1
+                                end
+                                l.logf "not found (#{row_key})"
+                            end
+                        rescue => e
+                            l.logf "error in line `#{line}`: #{e.to_console}"
+                            # change the max, in order not to repeat the same iteration and exit the loop in the line `break if middle==prev`
+                            #i+=1
+                            #max+=1
+                        end # begin
                     end
                     # closing the file
                     f.close
@@ -316,12 +342,16 @@ module BlackStack
                     l.done
                     # increment file counter
                     n += 1
+                    # tracing log
+                    l.log "i: #{i.to_s}"
+                    l.log "max: #{max.to_s}"
                 end
+                # end time
                 end_time = Time.now
                 ret[:enlapsed_seconds] = end_time - start_time
                 ret[:lines_matched] = total_matches
+                ret[:files_processed] = n
                 l.log "Matches: #{total_matches.to_s}"
                 l.log "Enlapsed seconds: #{ret[:enlapsed_seconds].to_s}"

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: csv-indexer
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.0.2
 platform: ruby
 authors:
 - Leandro Daniel Sardi