eco-helpers 2.0.18 → 2.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +80 -1
  3. data/eco-helpers.gemspec +4 -1
  4. data/lib/eco/api/common/base_loader.rb +9 -5
  5. data/lib/eco/api/common/loaders/parser.rb +1 -0
  6. data/lib/eco/api/common/people/default_parsers.rb +1 -0
  7. data/lib/eco/api/common/people/default_parsers/xls_parser.rb +53 -0
  8. data/lib/eco/api/common/people/entries.rb +1 -0
  9. data/lib/eco/api/common/people/entry_factory.rb +88 -23
  10. data/lib/eco/api/common/people/person_entry.rb +1 -0
  11. data/lib/eco/api/common/people/person_parser.rb +1 -1
  12. data/lib/eco/api/common/session.rb +1 -0
  13. data/lib/eco/api/common/session/base_session.rb +2 -0
  14. data/lib/eco/api/common/session/helpers.rb +30 -0
  15. data/lib/eco/api/common/session/helpers/prompt_user.rb +34 -0
  16. data/lib/eco/api/common/version_patches/ecoportal_api/external_person.rb +1 -1
  17. data/lib/eco/api/common/version_patches/ecoportal_api/internal_person.rb +7 -4
  18. data/lib/eco/api/common/version_patches/exception.rb +5 -2
  19. data/lib/eco/api/microcases/with_each.rb +67 -6
  20. data/lib/eco/api/microcases/with_each_present.rb +4 -2
  21. data/lib/eco/api/microcases/with_each_starter.rb +4 -2
  22. data/lib/eco/api/organization.rb +1 -1
  23. data/lib/eco/api/organization/people.rb +94 -25
  24. data/lib/eco/api/organization/people_similarity.rb +272 -0
  25. data/lib/eco/api/organization/person_schemas.rb +5 -1
  26. data/lib/eco/api/organization/policy_groups.rb +5 -1
  27. data/lib/eco/api/organization/tag_tree.rb +33 -0
  28. data/lib/eco/api/session.rb +19 -8
  29. data/lib/eco/api/session/batch.rb +7 -5
  30. data/lib/eco/api/session/batch/job.rb +34 -9
  31. data/lib/eco/api/usecases.rb +2 -2
  32. data/lib/eco/api/usecases/base_case.rb +2 -2
  33. data/lib/eco/api/usecases/base_io.rb +17 -4
  34. data/lib/eco/api/usecases/default_cases.rb +1 -0
  35. data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +179 -32
  36. data/lib/eco/api/usecases/default_cases/clean_unknown_tags_case.rb +37 -0
  37. data/lib/eco/api/usecases/default_cases/to_csv_case.rb +81 -36
  38. data/lib/eco/api/usecases/default_cases/to_csv_detailed_case.rb +3 -4
  39. data/lib/eco/api/usecases/ooze_samples/ooze_update_case.rb +3 -2
  40. data/lib/eco/cli/config/default/input.rb +61 -8
  41. data/lib/eco/cli/config/default/options.rb +47 -2
  42. data/lib/eco/cli/config/default/people.rb +18 -24
  43. data/lib/eco/cli/config/default/usecases.rb +33 -2
  44. data/lib/eco/cli/config/default/workflow.rb +12 -7
  45. data/lib/eco/cli/scripting/args_helpers.rb +2 -2
  46. data/lib/eco/csv.rb +4 -2
  47. data/lib/eco/csv/table.rb +121 -21
  48. data/lib/eco/data/fuzzy_match.rb +109 -27
  49. data/lib/eco/data/fuzzy_match/chars_position_score.rb +3 -2
  50. data/lib/eco/data/fuzzy_match/ngrams_score.rb +19 -10
  51. data/lib/eco/data/fuzzy_match/pairing.rb +12 -19
  52. data/lib/eco/data/fuzzy_match/result.rb +22 -2
  53. data/lib/eco/data/fuzzy_match/results.rb +30 -6
  54. data/lib/eco/data/fuzzy_match/score.rb +12 -7
  55. data/lib/eco/data/fuzzy_match/string_helpers.rb +14 -1
  56. data/lib/eco/version.rb +1 -1
  57. metadata +67 -3
  58. data/lib/eco/api/organization/people_analytics.rb +0 -60
@@ -28,7 +28,10 @@ ASSETS.cli.config do |config|
28
28
  cases_with_input = config.usecases.active(io: io).select do |usecase, data|
29
29
  io.class.input_required?(usecase.type)
30
30
  end
31
- next io unless (!io.input || io.input.empty?) && !cases_with_input.empty?
31
+
32
+ input_is_required = !cases_with_input.empty? || io.options.dig(:input, :entries_from)
33
+ missing_input = !io.input || io.input.empty?
34
+ next io unless missing_input && input_is_required
32
35
 
33
36
  if io.options.dig(:input, :entries_from)
34
37
  io = io.new(input: config.input.get(io: io))
@@ -50,8 +53,7 @@ ASSETS.cli.config do |config|
50
53
  cases_with_people = config.usecases.active(io: io).select do |usecase, data|
51
54
  io.class.people_required?(usecase.type)
52
55
  end
53
- get_people = io.options.dig(:people, :get, :from) == :remote
54
- next io unless !cases_with_people.empty? || get_people
56
+ next io if cases_with_people.empty? && !io.options.dig(:people, :get)
55
57
  io = io.new(people: config.people(io: io))
56
58
  end
57
59
 
@@ -64,7 +66,8 @@ ASSETS.cli.config do |config|
64
66
 
65
67
  wf.before(:usecases) do |wf_cases, io|
66
68
  # save partial entries -> should be native to session.workflow
67
- partial_update = io.options.dig(:people, :get, :type) == :partial
69
+ get_people = io.options.dig(:people, :get)
70
+ partial_update = get_people && get_people.dig(:type) == :partial
68
71
  if !io.options[:dry_run] && partial_update
69
72
  partial_file = io.session.config.people.partial_cache
70
73
  io.session.file_manager.save_json(io.people, partial_file, :timestamp)
@@ -95,11 +98,12 @@ ASSETS.cli.config do |config|
95
98
  if io.session.post_launch.empty?
96
99
  wf_post.skip!
97
100
  else
98
- partial_update = io.options.dig(:people, :get, :type) == :partial
101
+ get_people = io.options.dig(:people, :get)
102
+ partial_update = get_people && get_people.dig(:type) == :partial
99
103
  if !io.options[:dry_run] && partial_update
100
104
  # get target people afresh
101
105
  people = io.session.micro.people_refresh(people: io.people, include_created: true)
102
- io = io.new(people: people)
106
+ io = io.base.new(people: people)
103
107
  else
104
108
  wf_post.skip!
105
109
  msg = "Although there are post_launch cases, they will NOT be RUN"
@@ -136,7 +140,8 @@ ASSETS.cli.config do |config|
136
140
  end
137
141
 
138
142
  wf.on(:end) do |wf_end, io|
139
- partial_update = io.options.dig(:people, :get, :type) == :partial
143
+ get_people = io.options.dig(:people, :get)
144
+ partial_update = get_people && get_people.dig(:type) == :partial
140
145
  unless !io.options[:end_get] || io.options[:dry_run] || partial_update
141
146
  people = io.session.micro.people_cache
142
147
  io = io.new(people: people)
@@ -75,10 +75,10 @@ module Eco
75
75
  def get_file(key, required: false, should_exist: true)
76
76
  filename = get_arg(key, with_param: true)
77
77
  if !filename && required
78
- puts "You need to specify a file '#{key} file'"
78
+ puts "You need to specify a file or folder '#{key} file_or_folder'"
79
79
  exit(1)
80
80
  elsif !file_exists?(filename) && should_exist && required
81
- puts "This file doesn't exist '#{filename}'"
81
+ puts "This file/folder doesn't exist '#{filename}'"
82
82
  exit(1)
83
83
  end
84
84
 
data/lib/eco/csv.rb CHANGED
@@ -18,8 +18,10 @@ module Eco
18
18
  kargs = {headers: true, skip_blanks: true}.merge(kargs)
19
19
 
20
20
  args = [file].tap do |arg|
21
- coding = Eco::API::Common::Session::FileManager.encoding(file)
22
- arg.push("rb:bom|utf-8") if coding == "bom"
21
+ encoding = Eco::API::Common::Session::FileManager.encoding(file)
22
+ #encoding = (encoding != "utf-8")? "#{encoding}|utf-8": encoding
23
+ #arg.push(encoding)
24
+ arg.push("rb:bom|utf-8") if encoding == "bom"
23
25
  end
24
26
 
25
27
  out = super(*args, **kargs).reject do |row|
data/lib/eco/csv/table.rb CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  module Eco
3
2
  class CSV
4
3
  class Table < ::CSV::Table
@@ -9,6 +8,70 @@ module Eco
9
8
  super(to_rows_array(input))
10
9
  end
11
10
 
11
+ # @return [Hash] where the keys are the groups and each value is an `Eco::CSV::Table`
12
+ def group_by(&block)
13
+ rows.group_by(&block).transform_values do |rows|
14
+ self.class.new(rows)
15
+ end
16
+ end
17
+
18
+ # @return [Eco::CSV::Table]
19
+ def transform_values
20
+ transformed_rows = rows.map do |row|
21
+ res = yield(row)
22
+ case res
23
+ when Array
24
+ ::CSV::Row.new(row.headers, res)
25
+ when ::CSV::Row
26
+ res
27
+ end
28
+ end
29
+ self.class.new(transformed_rows)
30
+ end
31
+
32
+ # Slices the selected rows
33
+ # @return [Eco::CSV::Table]
34
+ def slice(*index)
35
+ case index.first
36
+ when Range, Numeric
37
+ self.class.new(rows.slice(index.first))
38
+ else
39
+ self
40
+ end
41
+ end
42
+
43
+ # @return [Eco::CSV::Table]
44
+ def slice_columns(*index)
45
+ case index.first
46
+ when Range, Numeric
47
+ columns_to_table(columns.slice(index.first))
48
+ when String
49
+ csv_cols = columns
50
+ csv_cols = index.each_with_object([]) do |name, cols|
51
+ col = csv_cols.find {|col| col.first == name}
52
+ cols << col if col
53
+ end
54
+ columns_to_table(csv_cols)
55
+ else
56
+ self
57
+ end
58
+ end
59
+
60
+ # @return [Eco::CSV::Table]
61
+ def delete_column(i)
62
+ csv_cols = columns
63
+ csv_cols.delete(i)
64
+ columns_to_table(csv_cols)
65
+ end
66
+
67
+ # Adds a new column at the end
68
+ # @param header_name [String] header of the new column
69
+ # @return [Eco::CSV::Table] with a new empty column
70
+ def add_column(header_name)
71
+ new_col = Array.new(length).unshift(header_name)
72
+ columns_to_table(columns.push(new_col))
73
+ end
74
+
12
75
  # @return [Array<::CSV::Row>]
13
76
  def rows
14
77
  [].tap do |out|
@@ -16,24 +79,40 @@ module Eco
16
79
  end
17
80
  end
18
81
 
82
+ # Removes duplicated rows: any row whose values all equal those of a previous row (the first occurrence is kept)
83
+ def delete_duplicates!
84
+ unique_rows = []
85
+ self.by_row!.delete_if do |row|
86
+ unique_rows.any? {|done| equal_rows?(row, done)}.tap do |found|
87
+ unique_rows << row unless found
88
+ end
89
+ end
90
+ end
91
+
92
+ # @param row1 [CSV::Row] row to be compared
93
+ # @param row2 [CSV::Row] row to be compared
94
+ # @return [Boolean] `true` if all values of `row1` are equal to those of `row2`
95
+ def equal_rows?(row1, row2)
96
+ row1.fields.zip(row2.fields).all? do |(v1, v2)|
97
+ v1 == v2
98
+ end
99
+ end
100
+
19
101
  # @return [Integer] total number of rows not including the header
20
102
  def length
21
103
  to_a.length - 1
22
104
  end
23
105
 
106
+ def empty?
107
+ length < 1
108
+ end
109
+
24
110
  # @return [Array<Array>] each array is the column header followed by its values
25
111
  def columns
26
112
  to_a.transpose
27
113
  end
28
114
 
29
- # Adds a new column at the end
30
- # @param header_name [String] header of the new column
31
- # @return [Eco::CSV::Table] with a new empty column
32
- def add_column(header_name)
33
- new_col = Array.new(length).unshift(header_name)
34
- columns_to_table(columns.push(new_col))
35
- end
36
-
115
+ # Creates a single `Hash` where each key, value is a column (header + values)
37
116
  # @note it will override columns with same header name
38
117
  # @return [Hash] keys are headers, values are arrays
39
118
  def columns_hash
@@ -42,6 +121,17 @@ module Eco
42
121
  end.to_h
43
122
  end
44
123
 
124
+ # Returns an array of row hashes
125
+ # @note it will override columns with same header
126
+ def to_a_h
127
+ rows.map(&:to_h)
128
+ end
129
+
130
+ # @see #to_a_h
131
+ def to_array_of_hashes
132
+ to_a_h
133
+ end
134
+
45
135
  private
46
136
 
47
137
  def columns_to_table(columns_array)
@@ -51,24 +141,34 @@ module Eco
51
141
 
52
142
  def to_rows_array(data)
53
143
  case data
54
- when Array
55
- return data unless data.length > 0
56
- if data.first.is_a?(::CSV::Row)
57
- data
58
- elsif data.first.is_a?(Array)
59
- headers = data.shift
60
- data.map do |arr_row|
61
- CSV::Row.new(headers, arr_row)
62
- end.compact
63
- else
64
- raise "Expected data that can be transformed into Array<Array>"
65
- end
66
144
  when ::CSV::Table
67
145
  to_rows_array(data.to_a)
68
146
  when Hash
69
147
  # hash of columns header as key and column array as value
70
148
  rows_arrays = [a.keys].concat(a.values.first.zip(*a.values[1..-1]))
71
149
  to_rows_array(data.keys)
150
+ when Enumerable
151
+ data = data.dup.compact
152
+ return data unless data.count > 0
153
+ sample = data.first
154
+
155
+ case sample
156
+ when ::CSV::Row
157
+ data
158
+ when Array
159
+ headers = data.shift
160
+ data.map do |arr_row|
161
+ ::CSV::Row.new(headers, arr_row)
162
+ end.compact
163
+ when Hash
164
+ headers = sample.keys
165
+ headers_str = headers.map(&:to_s)
166
+ data.map do |hash|
167
+ ::CSV::Row.new(headers_str, hash.values_at(*headers))
168
+ end.compact
169
+ else
170
+ raise "Expected data that can be transformed into Array<::CSV::Row>. Given 'Enumerable' of '#{sample.class}'"
171
+ end
72
172
  else
73
173
  raise "Input type not supported. Given: #{data.class}"
74
174
  end
@@ -27,17 +27,29 @@ module Eco
27
27
  include CharsPositionScore
28
28
  include NGramsScore
29
29
 
30
- def jaro_winkler(str1, str2)
30
+ def jaro_winkler(str1, str2, **options)
31
+ return 0 if !str1 || !str2
31
32
  options = {
32
33
  ignore_case: true,
33
34
  weight: 0.25
34
- }
35
+ }.merge(options)
35
36
  JaroWinkler.distance(str1, str2, **options)
36
37
  end
37
38
 
38
39
  end
39
40
 
40
41
  module InstanceMethods
42
+ FUZZY_MATCH_OPTIONS = [
43
+ :identities, :groupings, :stop_words, :read,
44
+ :must_match_grouping, :must_match_at_least_one_word,
45
+ :gather_last_result, :threshold
46
+ ]
47
+
48
+ JARO_OPTIONS = [:ignore_case, :weight]
49
+ NGRAMS_OPTIONS = [:range]
50
+ POSITION_OPTIONS = [:max_distance]
51
+ RESULTS_OPTIONS = [:order, :threshold]
52
+
41
53
  include StopWords
42
54
 
43
55
  attr_accessor :fuzzy_options
@@ -46,62 +58,132 @@ module Eco
46
58
  @fuzzy_options ||= {}
47
59
  end
48
60
 
49
- def fuzzy_match(haystack = nil, **options)
50
- return @fuzzy_match if instance_variable_defined?(:@fuzzy_match)
51
- @fuzzy_options = options.merge({
52
- stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
53
- })
61
+ def fuzzy_match(haystack_data = nil, **options)
62
+ if instance_variable_defined?(:@fuzzy_match) && !haystack_data
63
+ return @fuzzy_match if fuzzy_match_options == fuzzy_match_options(options)
64
+ end
65
+ @fuzzy_options = options
54
66
  # make it run with a native C extension (for better performance: ~130 % increase of performance)
55
67
  ::FuzzyMatch.engine = :amatch
56
- haystack = obtain_haystack(haystack).tap do |items|
57
- if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
58
- raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
59
- end
60
- end
61
- @fuzzy_match = ::FuzzyMatch.new(haystack, fuzzy_options)
68
+ @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
62
69
  end
63
70
 
71
+ # TODO: integration for options[:unique_words] => to ensure repeated words do not bring down the score and get results cut by the threshold
64
72
  # @note
65
73
  # - When the `haystack` elements are **non** `String` objects, it excludes the needle itself from the results
66
- # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key
74
+ # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key.
75
+ # @param needle_str [String, nil] the string to match with in place of `needle`; when `nil`, it is read from `needle`.
76
+ # @param haystack [Enumerable] the items to find `needle` among.
67
77
  # @return [Eco::Data::FuzzyMatch::Results]
68
- def find_all_with_score(needle, **options)
69
- results = fuzzy_match(**options).find_all_with_score(needle).each_with_object([]) do |fuzzy_results, results|
78
+ def find_all_with_score(needle, needle_str: nil, haystack: nil, **options)
79
+ base_match = fuzzy_match(haystack, **options)
80
+ match_results = base_match.find_all_with_score(needle_str || needle)
81
+ needle_str ||= item_string(needle)
82
+ results = match_results.each_with_object([]) do |fuzzy_results, results|
70
83
  item, dice, lev = fuzzy_results
71
84
  unless item == needle
72
- needle_str = item_string(needle)
73
- item_str = item_string(item)
74
- jaro_res = self.class.jaro_winkler(needle_str, item_str)
75
- ngram_res = self.class.ngrams_score(needle_str, item_str, range: 3..5).ratio
76
- wngram_res = self.class.words_ngrams_score(needle_str, item_str, range: 3..7).ratio
77
- pos_res = self.class.chars_position_score(needle_str, item_str).ratio
78
- results << Result.new(item, item_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
85
+ item_str = item_string(item)
86
+
87
+ if item_str.to_s.strip.empty? || needle_str.to_s.strip.empty?
88
+ dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res = 0
89
+ end
90
+
91
+ jaro_res ||= jaro(needle_str, item_str)
92
+ ngram_res ||= ngram(needle_str, item_str)
93
+ wngram_res ||= words_ngram(needle_str, item_str)
94
+ pos_res ||= position(needle_str, item_str)
95
+
96
+ results << Result.new(item, item_str, needle_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
97
+ end
98
+ end
99
+ Results.new(needle, needle_str, results).tap do |res|
100
+ res.order = fuzzy_options[:order] if fuzzy_options[:order]
101
+ res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
102
+ end.relevant_results
103
+ end
104
+
105
+ def recalculate_results(results, needle_str: nil, **options)
106
+ raise "You should provide a block |needle_str, item_str, needle, item|" unless block_given?
107
+ new_results = results.each_with_object([]) do |result, new_results|
108
+ nstr, istr = yield(needle_str || results.value, result.value, results.needle, result.match)
109
+
110
+ if istr.to_s.strip.empty?
111
+ dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res = 1
112
+ elsif nstr.to_s.strip.empty?
113
+ unless istr = needle_str
114
+ dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res = 0
115
+ end
79
116
  end
117
+
118
+ res = ::FuzzyMatch.score_class.new(nstr, istr) unless dice && lev
119
+ dice ||= res&.dices_coefficient_similar || 0
120
+ lev ||= res&.levenshtein_similar || 0
121
+ jaro_res ||= jaro(nstr, istr)
122
+ ngram_res ||= ngram(nstr, istr)
123
+ wngram_res ||= words_ngram(nstr, istr)
124
+ pos_res ||= position(nstr, istr)
125
+
126
+ new_results << Result.new(*result.values_at(:match, :value, :needle_str), dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
80
127
  end
81
- Results.new(needle, item_string(needle), results)
128
+ Results.new(results.needle, results.value, new_results).tap do |res|
129
+ res.order = options[:order] if options[:order]
130
+ res.threshold = options[:threshold] if options[:threshold]
131
+ end.relevant_results
82
132
  end
83
133
 
84
134
  private
85
135
 
136
+ def jaro(str1, str2)
137
+ options = fuzzy_options.slice(*JARO_OPTIONS)
138
+ self.class.jaro_winkler(str1, str2, **options)
139
+ end
140
+
141
+ def ngram(str1, str2)
142
+ options = { range: 3..5 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
143
+ self.class.ngrams_score(str1, str2, **options).ratio
144
+ end
145
+
146
+ def words_ngram(str1, str2)
147
+ options = { range: 3..7 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
148
+ self.class.words_ngrams_score(str1, str2, **options).ratio
149
+ end
150
+
151
+ def position(str1, str2)
152
+ options = fuzzy_options.slice(*POSITION_OPTIONS)
153
+ self.class.chars_position_score(str1, str2, **options).ratio
154
+ end
155
+
86
156
  # @note
87
157
  # - When used in an `Enumerable` it will use `to_a`, or `values` if it's a `Hash`
88
158
  # @param data [Enumerable, nil]
89
159
  # @return [Array<Object>] the non-repeated values of `data`
90
- def obtain_haystack(data = nil)
160
+ def haystack(data = nil)
91
161
  data = self if self.is_a?(Enumerable) && !data
92
162
  raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)
93
163
  data = self.is_a?(Hash) ? self.values.flatten : to_a.flatten
94
- data.uniq.compact
164
+ data.uniq.compact.tap do |items|
165
+ if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
166
+ raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
167
+ end
168
+ end
95
169
  end
96
170
 
97
171
  def item_string(item, attr = fuzzy_read_method)
98
172
  return item if !item || item.is_a?(String) || !attr
173
+ return attr.call(item) if attr.is_a?(Proc)
99
174
  attr = attr.to_sym
100
175
  return item.send(attr) if item.respond_to?(attr)
101
176
  end
102
177
 
178
+ def fuzzy_match_options(options = nil)
179
+ options = fuzzy_options unless options
180
+ options.slice(*FUZZY_MATCH_OPTIONS).merge({
181
+ stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
182
+ })
183
+ end
184
+
103
185
  def fuzzy_read_method
104
- fuzzy_options[:read]
186
+ fuzzy_match_options[:read]
105
187
  end
106
188
 
107
189
  end