RubyGems - eco-helpers - Versions diffs - 2.0.19 → 2.0.21 - Mend

eco-helpers 2.0.19 → 2.0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +27 -1
data/eco-helpers.gemspec +5 -1
data/lib/eco/api/common/loaders/parser.rb +1 -0
data/lib/eco/api/common/people/entries.rb +1 -0
data/lib/eco/api/common/people/entry_factory.rb +49 -15
data/lib/eco/api/common/version_patches/exception.rb +5 -2
data/lib/eco/api/organization/people.rb +2 -2
data/lib/eco/api/organization/people_similarity.rb +171 -11
data/lib/eco/api/organization/tag_tree.rb +33 -0
data/lib/eco/api/session.rb +4 -2
data/lib/eco/api/usecases/default_cases.rb +1 -0
data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +189 -19
data/lib/eco/api/usecases/default_cases/clean_unknown_tags_case.rb +37 -0
data/lib/eco/cli/config/default/options.rb +29 -1
data/lib/eco/cli/config/default/people.rb +18 -24
data/lib/eco/cli/config/default/usecases.rb +31 -2
data/lib/eco/cli/config/default/workflow.rb +7 -5
data/lib/eco/csv/table.rb +121 -21
data/lib/eco/data/fuzzy_match.rb +52 -12
data/lib/eco/data/fuzzy_match/chars_position_score.rb +3 -2
data/lib/eco/data/fuzzy_match/ngrams_score.rb +13 -9
data/lib/eco/data/fuzzy_match/pairing.rb +12 -18
data/lib/eco/data/fuzzy_match/result.rb +15 -1
data/lib/eco/data/fuzzy_match/results.rb +18 -0
data/lib/eco/data/fuzzy_match/score.rb +12 -7
data/lib/eco/data/fuzzy_match/string_helpers.rb +14 -1
data/lib/eco/version.rb +1 -1
metadata +83 -2

data/lib/eco/cli/config/default/workflow.rb CHANGED Viewed

@@ -53,8 +53,7 @@ ASSETS.cli.config do |config|
           cases_with_people = config.usecases.active(io: io).select do |usecase, data|
             io.class.people_required?(usecase.type)
           end
-          get_people = io.options.dig(:people, :get, :from) == :remote
-          next io unless !cases_with_people.empty? || get_people
+          next io if cases_with_people.empty? && !io.options.dig(:people, :get)
           io = io.new(people:  config.people(io: io))
         end
@@ -67,7 +66,8 @@ ASSETS.cli.config do |config|
     wf.before(:usecases) do |wf_cases, io|
       # save partial entries -> should be native to session.workflow
-      partial_update = io.options.dig(:people, :get, :type) == :partial
+      get_people     = io.options.dig(:people, :get)
+      partial_update = get_people && get_people.dig(:type) == :partial
       if !io.options[:dry_run] && partial_update
         partial_file = io.session.config.people.partial_cache
         io.session.file_manager.save_json(io.people, partial_file, :timestamp)
@@ -98,7 +98,8 @@ ASSETS.cli.config do |config|
       if io.session.post_launch.empty?
         wf_post.skip!
       else
-        partial_update = io.options.dig(:people, :get, :type) == :partial
+        get_people     = io.options.dig(:people, :get)
+        partial_update = get_people && get_people.dig(:type) == :partial
         if !io.options[:dry_run] && partial_update
           # get target people afresh
           people = io.session.micro.people_refresh(people: io.people, include_created: true)
@@ -139,7 +140,8 @@ ASSETS.cli.config do |config|
     end
     wf.on(:end) do |wf_end, io|
-      partial_update = io.options.dig(:people, :get, :type) == :partial
+      get_people     = io.options.dig(:people, :get)
+      partial_update = get_people && get_people.dig(:type) == :partial
       unless !io.options[:end_get] || io.options[:dry_run] || partial_update
         people = io.session.micro.people_cache
         io     = io.new(people: people)

data/lib/eco/csv/table.rb CHANGED Viewed

@@ -1,4 +1,3 @@
 module Eco
   class CSV
     class Table < ::CSV::Table
@@ -9,6 +8,70 @@ module Eco
         super(to_rows_array(input))
       end
+      # @return [Hash] where keys are the groups and the values a `Eco::CSV::Table`
+      def group_by(&block)
+        rows.group_by(&block).transform_values do |rows|
+          self.class.new(rows)
+        end
+      end
+      # @return [Eco::CSV::Table]
+      def transform_values
+        transformed_rows = rows.map do |row|
+          res = yield(row)
+          case res
+          when Array
+            ::CSV::Row.new(row.headers, res)
+          when ::CSV::Row
+            res
+          end
+        end
+        self.class.new(transformed_rows)
+      end
+      # Slices the selected rows
+      # @return [Eco::CSV::Table]
+      def slice(*index)
+        case index.first
+        when Range, Numeric
+          self.class.new(rows.slice(index.first))
+        else
+          self
+        end
+      end
+      # @return [Eco::CSV::Table]
+      def slice_columns(*index)
+        case index.first
+        when Range, Numeric
+          columns_to_table(columns.slice(index.first))
+        when String
+          csv_cols = columns
+          csv_cols = index.each_with_object([]) do |name, cols|
+            col = csv_cols.find {|col| col.first == name}
+            cols << col if col
+          end
+          columns_to_table(csv_cols)
+        else
+          self
+        end
+      end
+      # @return [Eco::CSV::Table]
+      def delete_column(i)
+        csv_cols = columns
+        csv_cols.delete(i)
+        columns_to_table(csv_cols)
+      end
+      # Adds a new column at the end
+      # @param header_name [String] header of the new column
+      # @return [Eco::CSV::Table] with a new empty column
+      def add_column(header_name)
+        new_col = Array.new(length).unshift(header_name)
+        columns_to_table(columns.push(new_col))
+      end
       # @return [Array<::CSV::Row>]
       def rows
         [].tap do |out|
@@ -16,24 +79,40 @@ module Eco
         end
       end
+      # It removes all rows where all columns' values are the same
+      def delete_duplicates!
+        unique_rows = []
+        self.by_row!.delete_if do |row|
+          unique_rows.any? {|done| equal_rows?(row, done)}.tap do |found|
+            unique_rows << row unless found
+          end
+        end
+      end
+      # @param row1 [CSV:Row] row to be compared
+      # @param row2 [CSV:Row] row to be compared
+      # @param [Boolean] `true` if all values of `row1` are as of `row2`
+      def equal_rows?(row1, row2)
+        row1.fields.zip(row2.fields).all? do |(v1, v2)|
+          v1 == v2
+        end
+      end
       # @return [Integer] total number of rows not including the header
       def length
         header_and_rows = to_a
         header_and_rows.size - 1
       end
+      def empty?
+        length < 1
+      end
       # @return [Array<Array>] each array is the column header followed by its values
       def columns
         header_and_rows = to_a
         header_and_rows.transpose
       end
-      # Adds a new column at the end
-      # @param header_name [String] header of the new column
-      # @return [Eco::CSV::Table] with a new empty column
-      def add_column(header_name)
-        new_col = Array.new(length).unshift(header_name)
-        columns_to_table(columns.push(new_col))
-      end
+      # Creates a single `Hash` where each key, value is a column (header + values)
       # @note it will override columns with same header name
       # @return [Hash] keys are headers, values are arrays
       def columns_hash
@@ -42,6 +121,17 @@ module Eco
         end.to_h
       end
+      # Returns an array of row hashes
+      # @note it will override columns with same header
+      def to_a_h
+        rows.map(&:to_h)
+      end
+      # Long name of `#to_a_h` (it just delegates to it)
+      # @see #to_a_h
+      def to_array_of_hashes
+        to_a_h
+      end
       private
       def columns_to_table(columns_array)
@@ -51,24 +141,34 @@ module Eco
       def to_rows_array(data)
         case data
-        when Array
-          return data unless data.length > 0
-          if data.first.is_a?(::CSV::Row)
-            data
-          elsif data.first.is_a?(Array)
-            headers  = data.shift
-            data.map do |arr_row|
-              CSV::Row.new(headers, arr_row)
-            end.compact
-          else
-            raise "Expected data that can be transformed into Array<Array>"
-          end
         when ::CSV::Table
           to_rows_array(data.to_a)
         when Hash
           # hash of columns header as key and column array as value
           rows_arrays = [a.keys].concat(a.values.first.zip(*a.values[1..-1]))
           to_rows_array(data.keys)
+        when Enumerable
+          data = data.dup.compact
+          return data unless data.count > 0
+          sample = data.first
+          case sample
+          when ::CSV::Row
+            data
+          when Array
+            headers  = data.shift
+            data.map do |arr_row|
+              ::CSV::Row.new(headers, arr_row)
+            end.compact
+          when Hash
+            headers     = sample.keys
+            headers_str = headers.map(&:to_s)
+            data.map do |hash|
+              ::CSV::Row.new(headers_str, hash.values_at(*headers))
+            end.compact
+          else
+            raise "Expected data that can be transformed into Array<::CSV::Row>. Given 'Enumerable' of '#{sample.class}'"
+          end
         else
           raise "Input type not supported. Given: #{data.class}"
         end

data/lib/eco/data/fuzzy_match.rb CHANGED Viewed

@@ -28,6 +28,7 @@ module Eco
         include NGramsScore
         def jaro_winkler(str1, str2, **options)
+          return 0 if !str1 || !str2
           options = {
             ignore_case: true,
             weight:      0.25
@@ -67,28 +68,67 @@ module Eco
           @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
         end
+        # TODO: integration for options[:unique_words] => to ensure repeated words do not bring down the score are cut by threshold
         # @note
         #   - When the `haystack` elements are **non** `String` objects, it excludes the needle itself from the results
-        # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key
+        # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key.
+        # @param needle_str [String, nil] the string to be used for `needle`; when `nil`, it is searched by `needle` and scored with `item_string(needle)`.
+        # @param haystack [Enumerable] the items to find `needle` among.
+        # @note the value returned is the outcome of `relevant_results` called on the `Results` built here.
         # @return [Eco::Data::FuzzyMatch::Results]
-        def find_all_with_score(needle, **options)
-          results = fuzzy_match(**options).find_all_with_score(needle).each_with_object([]) do |fuzzy_results, results|
+        def find_all_with_score(needle, needle_str: nil, haystack: nil, **options)
+          base_match    = fuzzy_match(haystack, **options)
+          match_results = base_match.find_all_with_score(needle_str || needle)
+          needle_str  ||= item_string(needle)
+          results       = match_results.each_with_object([]) do |fuzzy_results, results|
             item, dice, lev = fuzzy_results
             unless item == needle
-              needle_str = item_string(needle)
-              item_str   = item_string(item)
-              jaro_res   = jaro(needle_str, item_str)
-              ngram_res  = ngram(needle_str, item_str)
-              wngram_res = words_ngram(needle_str, item_str)
-              pos_res    = position(needle_str, item_str)
-              results << Result.new(item, item_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
+              item_str     = item_string(item)
+              # a blank needle or item string zeroes every score, so none of them gets calculated below
+              if item_str.to_s.strip.empty? || needle_str.to_s.strip.empty?
+                dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res  = 0
+              end
+              jaro_res     ||= jaro(needle_str, item_str)
+              ngram_res    ||= ngram(needle_str, item_str)
+              wngram_res   ||= words_ngram(needle_str, item_str)
+              pos_res      ||= position(needle_str, item_str)
+              results << Result.new(item, item_str, needle_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
             end
           end
-          Results.new(needle, item_string(needle), results).tap do |res|
+          Results.new(needle, needle_str, results).tap do |res|
             res.order     = fuzzy_options[:order]     if fuzzy_options[:order]
             res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
+          end.relevant_results
+        end
+        def recalculate_results(results, needle_str: nil, **options)
+          raise "You should provide a block |needle_str, item_str, needle, item|" unless block_given?
+          new_results = results.each_with_object([]) do |result, new_results|
+            nstr, istr = yield(needle_str || results.value, result.value, results.needle, result.match)
+            if istr.to_s.strip.empty?
+              dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res  = 1
+            elsif nstr.to_s.strip.empty?
+              unless istr = needle_str
+                dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res  = 0
+              end
+            end
+            res          = ::FuzzyMatch.score_class.new(nstr, istr) unless dice && lev
+            dice       ||= res&.dices_coefficient_similar || 0
+            lev        ||= res&.levenshtein_similar       || 0
+            jaro_res   ||= jaro(nstr, istr)
+            ngram_res  ||= ngram(nstr, istr)
+            wngram_res ||= words_ngram(nstr, istr)
+            pos_res    ||= position(nstr, istr)
+            new_results << Result.new(*result.values_at(:match, :value, :needle_str), dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
           end
+          Results.new(results.needle, results.value, new_results).tap do |res|
+            res.order     = options[:order]     if options[:order]
+            res.threshold = options[:threshold] if options[:threshold]
+          end.relevant_results
         end
         private

data/lib/eco/data/fuzzy_match/chars_position_score.rb CHANGED Viewed

@@ -12,8 +12,9 @@ module Eco
         def chars_position_score(str1, str2, max_distance: 3, normalized: false)
           str1, str2 = normalize_string([str1, str2]) unless normalized
           len1 = str1 && str1.length; len2 = str2 && str2.length
-          Score.new(0, len1 || 0).tap do |score|
-            next if !str1 || !str2
+          Score.new(0, 0).tap do |score|
+            next if !str2 || !str1 || str2.empty? || str1.empty?
+            score.total = len1
             next score.increase(score.total) if str1 == str2
             next if len1 < 2
             pos = 0

data/lib/eco/data/fuzzy_match/ngrams_score.rb CHANGED Viewed

@@ -16,18 +16,19 @@ module Eco
           Score.new(0, 0).tap do |score|
             next if !str2 || !str1
+            next score.increase_total(len1) if str2.empty? || str1.empty?
             if str1 == str2
-              score.increase_total(len1)
+              score.total = len1
               score.increase(score.total)
             end
             if str1.length < 2 || str1.length < 2
               score.increase_total(len1)
             end
-            paired_words(str1, str2, normalized: true) do |needle, item|
+            pairs = paired_words(str1, str2, normalized: true) do |needle, item|
               ngrams_score(needle, item, range: range, normalized: true)
-            end.each do |sub_str1, (item, iscore)|
-              #puts "pairs '#{sub_str1}' --> '#{item}' (score: #{iscore.ratio})"
+            end.each do |sub_str1, data|
+              item, iscore = data
               score.merge!(iscore)
             end
           end
@@ -44,14 +45,17 @@ module Eco
           Score.new(0, len1 || 0).tap do |score|
             next if !str2 || !str1
+            next if str2.empty? || str1.empty?
+            score.total = len1
             next score.increase(score.total) if str1 == str2
             next if str1.length < 2 || str2.length < 2
-            grams = word_ngrams(str2, range, normalized: true)
-            next unless grams.length > 0
+            grams     = word_ngrams(str2, range, normalized: true)
+            grams_count = grams.length
+            next unless grams_count > 0
             if range.is_a?(Integer)
-              item_weight = score.total.to_f / grams.length
+              item_weight = score.total.to_f / grams_count
               matches     = grams.select {|res| str1.include?(gram)}.length
               score.increase(matches * item_weight)
             else
@@ -62,9 +66,9 @@ module Eco
               groups.each do |len, grams|
                 len_max_score  = score.total * group_weight
-                item_weight    = len_max_score / grams.length
+                item_weight    = len_max_score / grams_count
                 matches        = grams.select {|gram| str1.include?(gram)}.length
-                #pp "#{len} match: #{matches} (over #{grams.length}) || max_score: #{len_max_score} (over #{score.total})"
+                #pp "(#{len}) match: #{matches} (of #{grams.length} of total #{grams_count}) || max_score: #{len_max_score} (over #{score.total})"
                 score.increase(matches * item_weight)
               end
             end

data/lib/eco/data/fuzzy_match/pairing.rb CHANGED Viewed

@@ -15,19 +15,12 @@ module Eco
         # @yieldreturn [Eco::Data::FuzzyMatch::Score] the `Score` object with the results of comparing `str1` and `str2`
         # @param str1 [String] the string of reference.
         # @param str2 [String] one of the haystack items.
-        # @param format [Symbol] determines the `values` of the returned `Hash`::
-        #   1. `:pair` for just pair
-        #   2. `:score` for just score
-        #   2. `[:pair, :score]` for `Array`
         # @param normalized [Boolean] `true` when the strings are already normalized (avoids normalizing them twice).
-        # @return [Hash] where `keys` are the **words** of `str1` and their `values`:
-        #   1. if `format` is `:pair` => the `str2` words with highest match.
-        #   2. if `format` is `:score` => the `Score` words with highest match.
-        #   3. if `format` is `[:pair, :score]` => both in an `Array`.
-        def paired_words(str1, str2, format: [:pair, :score], normalized: false)
+        # @return [Hash] where `keys` are the **words** of `str1` and their `values` a pair array of `pair` and `Score`
+        def paired_words(str1, str2, normalized: false)
           str1, str2 = normalize_string([str1, str2]) unless normalized
-          return {} if !str2 || !str1
-          return {str1 => nil} if str1.length < 2 || str1.length < 2
+          return {nil => [nil, Score.new(0, 0)]} if !str2 || !str1
+          return {str1 => [nil, Score.new(0, 0)]} if str1.length < 2 || str1.length < 2
           needles    = get_words(str1, normalized: true)
           haystack   = get_words(str2, normalized: true)
@@ -58,6 +51,9 @@ module Eco
               result[:score].ratio
             end.reverse
             if result = sorted.shift
+              unless result[:score].is_a?(Eco::Data::FuzzyMatch::Score)
+                raise "Parining ('#{str1}' vs '#{str2}') -> Something got sour with needle '#{result[:needle]}' and item #{item}"
+              end
               paired[result[:needle]] = {
                 pair:  item,
                 score: result[:score]
@@ -73,6 +69,9 @@ module Eco
               pending_items.include?(result[:pair]) && result[:score].ratio > 0.05
             end
             if result = results.shift
+              unless result[:score].is_a?(Eco::Data::FuzzyMatch::Score)
+                raise "Parining ('#{str1}' vs '#{str2}') -> Something got sour with needle '#{needle}' and item #{result[:pair]}"
+              end
               paired[needle] = result
               pending_items.delete(result[:pair])
             end
@@ -85,13 +84,8 @@ module Eco
               score: Score.new(0, needle.length)
             }
           end
-          paired.transform_values do |result|
-            case format
-            when Array
-              result.values_at(*format)
-            else
-              restult[format]
-            end
+          paired.each_with_object({}) do |(needle, data), out|
+            out[needle] = data.values_at(:pair, :score)
           end
         end

data/lib/eco/data/fuzzy_match/result.rb CHANGED Viewed

@@ -1,9 +1,11 @@
 module Eco
   module Data
     module FuzzyMatch
-      class Result < Struct.new(:match, :value, :dice, :levenshtein, :jaro_winkler, :ngrams, :words_ngrams, :chars_position)
+      class Result < Struct.new(:match, :value, :needle_value, :dice, :levenshtein, :jaro_winkler, :ngrams, :words_ngrams, :chars_position)
         ALL_METHODS = [:dice, :levenshtein, :jaro_winkler, :ngrams, :words_ngrams, :chars_position]
+        attr_accessor :pivot
         # Readers of the scores rounded to 3 decimals (a missing score stays `nil`)
         def dice
           super&.round(3)
         end

         def levenshtein
           super&.round(3)
         end

         def jaro_winkler
           super&.round(3)
         end
@@ -11,6 +13,12 @@ module Eco
         # Readers of the scores rounded to 3 decimals (a missing score stays `nil`)
         def words_ngrams
           super&.round(3)
         end

         def chars_position
           super&.round(3)
         end
+        #Shortcuts
+        def lev; levenshtein; end
+        def jaro; jaro_winkler; end
+        def wngrams; words_ngrams; end
+        def pos; chars_position; end
         def average
           values = [dice, levenshtein, jaro_winkler, ngrams, words_ngrams, chars_position]
           (values.inject(0.0, :+) / values.length).round(3)
@@ -55,6 +63,12 @@ module Eco
           compare(result)
         end
+        def values_at(*keys)
+          keys.map do |key|
+            self.send(key) if self.respond_to?(key)
+          end
+        end
         private
         def compare(other, order: self.order)