RubyGems - eco-helpers - Versions diffs - 2.0.18 → 2.0.24 - Mend

eco-helpers 2.0.18 → 2.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +80 -1
data/eco-helpers.gemspec +4 -1
data/lib/eco/api/common/base_loader.rb +9 -5
data/lib/eco/api/common/loaders/parser.rb +1 -0
data/lib/eco/api/common/people/default_parsers.rb +1 -0
data/lib/eco/api/common/people/default_parsers/xls_parser.rb +53 -0
data/lib/eco/api/common/people/entries.rb +1 -0
data/lib/eco/api/common/people/entry_factory.rb +88 -23
data/lib/eco/api/common/people/person_entry.rb +1 -0
data/lib/eco/api/common/people/person_parser.rb +1 -1
data/lib/eco/api/common/session.rb +1 -0
data/lib/eco/api/common/session/base_session.rb +2 -0
data/lib/eco/api/common/session/helpers.rb +30 -0
data/lib/eco/api/common/session/helpers/prompt_user.rb +34 -0
data/lib/eco/api/common/version_patches/ecoportal_api/external_person.rb +1 -1
data/lib/eco/api/common/version_patches/ecoportal_api/internal_person.rb +7 -4
data/lib/eco/api/common/version_patches/exception.rb +5 -2
data/lib/eco/api/microcases/with_each.rb +67 -6
data/lib/eco/api/microcases/with_each_present.rb +4 -2
data/lib/eco/api/microcases/with_each_starter.rb +4 -2
data/lib/eco/api/organization.rb +1 -1
data/lib/eco/api/organization/people.rb +94 -25
data/lib/eco/api/organization/people_similarity.rb +272 -0
data/lib/eco/api/organization/person_schemas.rb +5 -1
data/lib/eco/api/organization/policy_groups.rb +5 -1
data/lib/eco/api/organization/tag_tree.rb +33 -0
data/lib/eco/api/session.rb +19 -8
data/lib/eco/api/session/batch.rb +7 -5
data/lib/eco/api/session/batch/job.rb +34 -9
data/lib/eco/api/usecases.rb +2 -2
data/lib/eco/api/usecases/base_case.rb +2 -2
data/lib/eco/api/usecases/base_io.rb +17 -4
data/lib/eco/api/usecases/default_cases.rb +1 -0
data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +179 -32
data/lib/eco/api/usecases/default_cases/clean_unknown_tags_case.rb +37 -0
data/lib/eco/api/usecases/default_cases/to_csv_case.rb +81 -36
data/lib/eco/api/usecases/default_cases/to_csv_detailed_case.rb +3 -4
data/lib/eco/api/usecases/ooze_samples/ooze_update_case.rb +3 -2
data/lib/eco/cli/config/default/input.rb +61 -8
data/lib/eco/cli/config/default/options.rb +47 -2
data/lib/eco/cli/config/default/people.rb +18 -24
data/lib/eco/cli/config/default/usecases.rb +33 -2
data/lib/eco/cli/config/default/workflow.rb +12 -7
data/lib/eco/cli/scripting/args_helpers.rb +2 -2
data/lib/eco/csv.rb +4 -2
data/lib/eco/csv/table.rb +121 -21
data/lib/eco/data/fuzzy_match.rb +109 -27
data/lib/eco/data/fuzzy_match/chars_position_score.rb +3 -2
data/lib/eco/data/fuzzy_match/ngrams_score.rb +19 -10
data/lib/eco/data/fuzzy_match/pairing.rb +12 -19
data/lib/eco/data/fuzzy_match/result.rb +22 -2
data/lib/eco/data/fuzzy_match/results.rb +30 -6
data/lib/eco/data/fuzzy_match/score.rb +12 -7
data/lib/eco/data/fuzzy_match/string_helpers.rb +14 -1
data/lib/eco/version.rb +1 -1
metadata +67 -3
data/lib/eco/api/organization/people_analytics.rb +0 -60

data/lib/eco/cli/config/default/workflow.rb CHANGED Viewed

@@ -28,7 +28,10 @@ ASSETS.cli.config do |config|
           cases_with_input =  config.usecases.active(io: io).select do |usecase, data|
             io.class.input_required?(usecase.type)
           end
-          next io unless (!io.input || io.input.empty?) && !cases_with_input.empty?
+          input_is_required = !cases_with_input.empty? || io.options.dig(:input, :entries_from)
+          missing_input     = !io.input || io.input.empty?
+          next io unless missing_input && input_is_required
           if io.options.dig(:input, :entries_from)
             io = io.new(input: config.input.get(io: io))
@@ -50,8 +53,7 @@ ASSETS.cli.config do |config|
           cases_with_people = config.usecases.active(io: io).select do |usecase, data|
             io.class.people_required?(usecase.type)
           end
-          get_people = io.options.dig(:people, :get, :from) == :remote
-          next io unless !cases_with_people.empty? || get_people
+          next io if cases_with_people.empty? && !io.options.dig(:people, :get)
           io = io.new(people:  config.people(io: io))
         end
@@ -64,7 +66,8 @@ ASSETS.cli.config do |config|
     wf.before(:usecases) do |wf_cases, io|
       # save partial entries -> should be native to session.workflow
-      partial_update = io.options.dig(:people, :get, :type) == :partial
+      get_people     = io.options.dig(:people, :get)
+      partial_update = get_people && get_people.dig(:type) == :partial
       if !io.options[:dry_run] && partial_update
         partial_file = io.session.config.people.partial_cache
         io.session.file_manager.save_json(io.people, partial_file, :timestamp)
@@ -95,11 +98,12 @@ ASSETS.cli.config do |config|
       if io.session.post_launch.empty?
         wf_post.skip!
       else
-        partial_update = io.options.dig(:people, :get, :type) == :partial
+        get_people     = io.options.dig(:people, :get)
+        partial_update = get_people && get_people.dig(:type) == :partial
         if !io.options[:dry_run] && partial_update
           # get target people afresh
           people = io.session.micro.people_refresh(people: io.people, include_created: true)
-          io     = io.new(people: people)
+          io     = io.base.new(people: people)
         else
           wf_post.skip!
           msg = "Although there are post_launch cases, they will NOT be RUN"
@@ -136,7 +140,8 @@ ASSETS.cli.config do |config|
     end
     wf.on(:end) do |wf_end, io|
-      partial_update = io.options.dig(:people, :get, :type) == :partial
+      get_people     = io.options.dig(:people, :get)
+      partial_update = get_people && get_people.dig(:type) == :partial
       unless !io.options[:end_get] || io.options[:dry_run] || partial_update
         people = io.session.micro.people_cache
         io     = io.new(people: people)

data/lib/eco/cli/scripting/args_helpers.rb CHANGED Viewed

@@ -75,10 +75,10 @@ module Eco
         def get_file(key, required: false, should_exist: true)
           filename = get_arg(key, with_param: true)
           if !filename && required
-            puts "You need to specify a file '#{key} file'"
+            puts "You need to specify a file or folder '#{key} file_or_folder'"
             exit(1)
           elsif !file_exists?(filename) && should_exist && required
-            puts "This file doesn't exist '#{filename}'"
+            puts "This file/folder doesn't exist '#{filename}'"
             exit(1)
           end

data/lib/eco/csv.rb CHANGED Viewed

@@ -18,8 +18,10 @@ module Eco
         kargs = {headers: true,  skip_blanks: true}.merge(kargs)
         args = [file].tap do |arg|
-          coding = Eco::API::Common::Session::FileManager.encoding(file)
-          arg.push("rb:bom|utf-8") if coding == "bom"
+          encoding = Eco::API::Common::Session::FileManager.encoding(file)
+          #encoding = (encoding != "utf-8")? "#{encoding}|utf-8": encoding
+          #arg.push(encoding)
+          arg.push("rb:bom|utf-8") if encoding == "bom"
         end
         out = super(*args, **kargs).reject do |row|

data/lib/eco/csv/table.rb CHANGED Viewed

@@ -1,4 +1,3 @@
 module Eco
   class CSV
     class Table < ::CSV::Table
@@ -9,6 +8,70 @@ module Eco
         super(to_rows_array(input))
       end
+      # @return [Hash] where keys are the groups and the values a `Eco::CSV::Table`
+      def group_by(&block)
+        rows.group_by(&block).transform_values do |rows|
+          self.class.new(rows)
+        end
+      end
+      # @return [Eco::CSV::Table]
+      def transform_values
+        transformed_rows = rows.map do |row|
+          res = yield(row)
+          case res
+          when Array
+            ::CSV::Row.new(row.headers, res)
+          when ::CSV::Row
+            res
+          end
+        end
+        self.class.new(transformed_rows)
+      end
+      # Slices the selected rows
+      # @return [Eco::CSV::Table]
+      def slice(*index)
+        case index.first
+        when Range, Numeric
+          self.class.new(rows.slice(index.first))
+        else
+          self
+        end
+      end
+      # @return [Eco::CSV::Table]
+      def slice_columns(*index)
+        case index.first
+        when Range, Numeric
+          columns_to_table(columns.slice(index.first))
+        when String
+          csv_cols = columns
+          csv_cols = index.each_with_object([]) do |name, cols|
+            col = csv_cols.find {|col| col.first == name}
+            cols << col if col
+          end
+          columns_to_table(csv_cols)
+        else
+          self
+        end
+      end
+      # @return [Eco::CSV::Table]
+      def delete_column(i)
+        csv_cols = columns
+        csv_cols.delete(i)
+        columns_to_table(csv_cols)
+      end
+      # Adds a new column at the end
+      # @param header_name [String] header of the new column
+      # @return [Eco::CSV::Table] with a new empty column
+      def add_column(header_name)
+        new_col = Array.new(length).unshift(header_name)
+        columns_to_table(columns.push(new_col))
+      end
       # @return [Array<::CSV::Row>]
       def rows
         [].tap do |out|
@@ -16,24 +79,40 @@ module Eco
         end
       end
+      # Removes every row whose field values all equal those of an earlier row (the first occurrence is kept)
+      def delete_duplicates!
+        unique_rows = []
+        self.by_row!.delete_if do |row|
+          unique_rows.any? {|done| equal_rows?(row, done)}.tap do |found|
+            unique_rows << row unless found
+          end
+        end
+      end
+      # @param row1 [::CSV::Row] row to be compared
+      # @param row2 [::CSV::Row] row to be compared
+      # @return [Boolean] `true` if every field of `row1` equals the field of `row2` at the same position
+      def equal_rows?(row1, row2)
+        row1.fields.zip(row2.fields).all? do |(v1, v2)|
+          v1 == v2
+        end
+      end
       # @return [Integer] total number of rows not including the header
       def length
         to_a.length - 1
       end
+      def empty?
+        length < 1
+      end
       # @return [Array<Array>] each array is the column header followed by its values
       def columns
         to_a.transpose
       end
-      # Adds a new column at the end
-      # @param header_name [String] header of the new column
-      # @return [Eco::CSV::Table] with a new empty column
-      def add_column(header_name)
-        new_col = Array.new(length).unshift(header_name)
-        columns_to_table(columns.push(new_col))
-      end
+      # Creates a single `Hash` where each key, value is a column (header + values)
       # @note it will override columns with same header name
       # @return [Hash] keys are headers, values are arrays
       def columns_hash
@@ -42,6 +121,17 @@ module Eco
         end.to_h
       end
+      # Returns an array of row hashes
+      # @note it will override columns with same header
+      def to_a_h
+        rows.map(&:to_h)
+      end
+      # @see #to_a_h
+      def to_array_of_hashes
+        to_a_h
+      end
       private
       def columns_to_table(columns_array)
@@ -51,24 +141,34 @@ module Eco
       def to_rows_array(data)
         case data
-        when Array
-          return data unless data.length > 0
-          if data.first.is_a?(::CSV::Row)
-            data
-          elsif data.first.is_a?(Array)
-            headers  = data.shift
-            data.map do |arr_row|
-              CSV::Row.new(headers, arr_row)
-            end.compact
-          else
-            raise "Expected data that can be transformed into Array<Array>"
-          end
         when ::CSV::Table
           to_rows_array(data.to_a)
         when Hash
           # hash of columns header as key and column array as value
           rows_arrays = [a.keys].concat(a.values.first.zip(*a.values[1..-1]))
           to_rows_array(data.keys)
+        when Enumerable
+          data = data.dup.compact
+          return data unless data.count > 0
+          sample = data.first
+          case sample
+          when ::CSV::Row
+            data
+          when Array
+            headers  = data.shift
+            data.map do |arr_row|
+              ::CSV::Row.new(headers, arr_row)
+            end.compact
+          when Hash
+            headers     = sample.keys
+            headers_str = headers.map(&:to_s)
+            data.map do |hash|
+              ::CSV::Row.new(headers_str, hash.values_at(*headers))
+            end.compact
+          else
+            raise "Expected data that can be transformed into Array<::CSV::Row>. Given 'Enumerable' of '#{sample.class}'"
+          end
         else
           raise "Input type not supported. Given: #{data.class}"
         end

data/lib/eco/data/fuzzy_match.rb CHANGED Viewed

@@ -27,17 +27,29 @@ module Eco
         include CharsPositionScore
         include NGramsScore
-        def jaro_winkler(str1, str2)
+        def jaro_winkler(str1, str2, **options)
+          return 0 if !str1 || !str2
           options = {
             ignore_case: true,
             weight:      0.25
-          }
+          }.merge(options)
           JaroWinkler.distance(str1, str2, **options)
         end
       end
       module InstanceMethods
+        FUZZY_MATCH_OPTIONS = [
+          :identities, :groupings, :stop_words, :read,
+          :must_match_grouping, :must_match_at_least_one_word,
+          :gather_last_result, :threshold
+        ]
+        JARO_OPTIONS     = [:ignore_case, :weight]
+        NGRAMS_OPTIONS   = [:range]
+        POSITION_OPTIONS = [:max_distance]
+        RESULTS_OPTIONS  = [:order, :threshold]
         include StopWords
         attr_accessor :fuzzy_options
@@ -46,62 +58,132 @@ module Eco
           @fuzzy_options ||= {}
         end
-        def fuzzy_match(haystack = nil, **options)
-          return @fuzzy_match if instance_variable_defined?(:@fuzzy_match)
-          @fuzzy_options = options.merge({
-            stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
-          })
+        def fuzzy_match(haystack_data = nil, **options)
+          if instance_variable_defined?(:@fuzzy_match) && !haystack_data
+            return @fuzzy_match if fuzzy_match_options == fuzzy_match_options(options)
+          end
+          @fuzzy_options = options
           # make it run with a native C extension (for better performance: ~130 % increase of performance)
           ::FuzzyMatch.engine = :amatch
-          haystack = obtain_haystack(haystack).tap do |items|
-            if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
-              raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
-            end
-          end
-          @fuzzy_match = ::FuzzyMatch.new(haystack, fuzzy_options)
+          @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
         end
+        # TODO: integrate options[:unique_words] => to ensure that repeated words do not bring the score down and get results cut by the threshold
         # @note
         #   - When the `haystack` elements are **non** `String` objects, it excludes the needle itself from the results
-        # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key
+        # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key.
+        # @param needle_str [String, nil] the string to use in place of `needle` when matching and scoring; when `nil`, it is obtained from `needle`.
+        # @param haystack [Enumerable] the items to find `needle` among.
         # @return [Eco::Data::FuzzyMatch::Results]
-        def find_all_with_score(needle, **options)
-          results = fuzzy_match(**options).find_all_with_score(needle).each_with_object([]) do |fuzzy_results, results|
+        def find_all_with_score(needle, needle_str: nil, haystack: nil, **options)
+          base_match    = fuzzy_match(haystack, **options)
+          match_results = base_match.find_all_with_score(needle_str || needle)
+          needle_str  ||= item_string(needle)
+          results       = match_results.each_with_object([]) do |fuzzy_results, results|
             item, dice, lev = fuzzy_results
             unless item == needle
-              needle_str = item_string(needle)
-              item_str   = item_string(item)
-              jaro_res   = self.class.jaro_winkler(needle_str, item_str)
-              ngram_res  = self.class.ngrams_score(needle_str, item_str, range: 3..5).ratio
-              wngram_res = self.class.words_ngrams_score(needle_str, item_str, range: 3..7).ratio
-              pos_res    = self.class.chars_position_score(needle_str, item_str).ratio
-              results << Result.new(item, item_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
+              item_str     = item_string(item)
+              if item_str.to_s.strip.empty? || needle_str.to_s.strip.empty?
+                dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res  = 0
+              end
+              jaro_res     ||= jaro(needle_str, item_str)
+              ngram_res    ||= ngram(needle_str, item_str)
+              wngram_res   ||= words_ngram(needle_str, item_str)
+              pos_res      ||= position(needle_str, item_str)
+              results << Result.new(item, item_str, needle_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
+            end
+          end
+          Results.new(needle, needle_str, results).tap do |res|
+            res.order     = fuzzy_options[:order]     if fuzzy_options[:order]
+            res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
+          end.relevant_results
+        end
+        def recalculate_results(results, needle_str: nil, **options)
+          raise "You should provide a block |needle_str, item_str, needle, item|" unless block_given?
+          new_results = results.each_with_object([]) do |result, new_results|
+            nstr, istr = yield(needle_str || results.value, result.value, results.needle, result.match)
+            if istr.to_s.strip.empty?
+              dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res  = 1
+            elsif nstr.to_s.strip.empty?
+              unless istr = needle_str
+                dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res  = 0
+              end
             end
+            res          = ::FuzzyMatch.score_class.new(nstr, istr) unless dice && lev
+            dice       ||= res&.dices_coefficient_similar || 0
+            lev        ||= res&.levenshtein_similar       || 0
+            jaro_res   ||= jaro(nstr, istr)
+            ngram_res  ||= ngram(nstr, istr)
+            wngram_res ||= words_ngram(nstr, istr)
+            pos_res    ||= position(nstr, istr)
+            new_results << Result.new(*result.values_at(:match, :value, :needle_str), dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
           end
-          Results.new(needle, item_string(needle), results)
+          Results.new(results.needle, results.value, new_results).tap do |res|
+            res.order     = options[:order]     if options[:order]
+            res.threshold = options[:threshold] if options[:threshold]
+          end.relevant_results
         end
         private
+        def jaro(str1, str2)
+          options = fuzzy_options.slice(*JARO_OPTIONS)
+          self.class.jaro_winkler(str1, str2, **options)
+        end
+        def ngram(str1, str2)
+          options = { range: 3..5 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
+          self.class.ngrams_score(str1, str2, **options).ratio
+        end
+        def words_ngram(str1, str2)
+          options = { range: 3..7 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
+          self.class.words_ngrams_score(str1, str2, **options).ratio
+        end
+        def position(str1, str2)
+          options = fuzzy_options.slice(*POSITION_OPTIONS)
+          self.class.chars_position_score(str1, str2, **options).ratio
+        end
         # @note
         #   - When used in an `Enumerable` it will use `to_a`, or `values` if it's a `Hash`
         # @param data [Enumerable, nil]
         # @return [Array<Object>] the non-repeated values of `data`
-        def obtain_haystack(data = nil)
+        def haystack(data = nil)
           data = self if self.is_a?(Enumerable) && !data
           raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)
           data = self.is_a?(Hash) ? self.values.flatten : to_a.flatten
-          data.uniq.compact
+          data.uniq.compact.tap do |items|
+            if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
+              raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
+            end
+          end
         end
         def item_string(item, attr = fuzzy_read_method)
           return item if !item || item.is_a?(String) || !attr
+          return attr.call(item) if attr.is_a?(Proc)
           attr = attr.to_sym
           return item.send(attr) if item.respond_to?(attr)
         end
+        def fuzzy_match_options(options = nil)
+          options = fuzzy_options unless options
+          options.slice(*FUZZY_MATCH_OPTIONS).merge({
+            stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
+          })
+        end
         def fuzzy_read_method
-          fuzzy_options[:read]
+          fuzzy_match_options[:read]
         end
       end