eco-helpers 2.0.18 → 2.0.24

Sign up to get free protection for your applications and to get access to all the features.
Files changed (58) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +80 -1
  3. data/eco-helpers.gemspec +4 -1
  4. data/lib/eco/api/common/base_loader.rb +9 -5
  5. data/lib/eco/api/common/loaders/parser.rb +1 -0
  6. data/lib/eco/api/common/people/default_parsers.rb +1 -0
  7. data/lib/eco/api/common/people/default_parsers/xls_parser.rb +53 -0
  8. data/lib/eco/api/common/people/entries.rb +1 -0
  9. data/lib/eco/api/common/people/entry_factory.rb +88 -23
  10. data/lib/eco/api/common/people/person_entry.rb +1 -0
  11. data/lib/eco/api/common/people/person_parser.rb +1 -1
  12. data/lib/eco/api/common/session.rb +1 -0
  13. data/lib/eco/api/common/session/base_session.rb +2 -0
  14. data/lib/eco/api/common/session/helpers.rb +30 -0
  15. data/lib/eco/api/common/session/helpers/prompt_user.rb +34 -0
  16. data/lib/eco/api/common/version_patches/ecoportal_api/external_person.rb +1 -1
  17. data/lib/eco/api/common/version_patches/ecoportal_api/internal_person.rb +7 -4
  18. data/lib/eco/api/common/version_patches/exception.rb +5 -2
  19. data/lib/eco/api/microcases/with_each.rb +67 -6
  20. data/lib/eco/api/microcases/with_each_present.rb +4 -2
  21. data/lib/eco/api/microcases/with_each_starter.rb +4 -2
  22. data/lib/eco/api/organization.rb +1 -1
  23. data/lib/eco/api/organization/people.rb +94 -25
  24. data/lib/eco/api/organization/people_similarity.rb +272 -0
  25. data/lib/eco/api/organization/person_schemas.rb +5 -1
  26. data/lib/eco/api/organization/policy_groups.rb +5 -1
  27. data/lib/eco/api/organization/tag_tree.rb +33 -0
  28. data/lib/eco/api/session.rb +19 -8
  29. data/lib/eco/api/session/batch.rb +7 -5
  30. data/lib/eco/api/session/batch/job.rb +34 -9
  31. data/lib/eco/api/usecases.rb +2 -2
  32. data/lib/eco/api/usecases/base_case.rb +2 -2
  33. data/lib/eco/api/usecases/base_io.rb +17 -4
  34. data/lib/eco/api/usecases/default_cases.rb +1 -0
  35. data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +179 -32
  36. data/lib/eco/api/usecases/default_cases/clean_unknown_tags_case.rb +37 -0
  37. data/lib/eco/api/usecases/default_cases/to_csv_case.rb +81 -36
  38. data/lib/eco/api/usecases/default_cases/to_csv_detailed_case.rb +3 -4
  39. data/lib/eco/api/usecases/ooze_samples/ooze_update_case.rb +3 -2
  40. data/lib/eco/cli/config/default/input.rb +61 -8
  41. data/lib/eco/cli/config/default/options.rb +47 -2
  42. data/lib/eco/cli/config/default/people.rb +18 -24
  43. data/lib/eco/cli/config/default/usecases.rb +33 -2
  44. data/lib/eco/cli/config/default/workflow.rb +12 -7
  45. data/lib/eco/cli/scripting/args_helpers.rb +2 -2
  46. data/lib/eco/csv.rb +4 -2
  47. data/lib/eco/csv/table.rb +121 -21
  48. data/lib/eco/data/fuzzy_match.rb +109 -27
  49. data/lib/eco/data/fuzzy_match/chars_position_score.rb +3 -2
  50. data/lib/eco/data/fuzzy_match/ngrams_score.rb +19 -10
  51. data/lib/eco/data/fuzzy_match/pairing.rb +12 -19
  52. data/lib/eco/data/fuzzy_match/result.rb +22 -2
  53. data/lib/eco/data/fuzzy_match/results.rb +30 -6
  54. data/lib/eco/data/fuzzy_match/score.rb +12 -7
  55. data/lib/eco/data/fuzzy_match/string_helpers.rb +14 -1
  56. data/lib/eco/version.rb +1 -1
  57. metadata +67 -3
  58. data/lib/eco/api/organization/people_analytics.rb +0 -60
@@ -28,7 +28,10 @@ ASSETS.cli.config do |config|
28
28
  cases_with_input = config.usecases.active(io: io).select do |usecase, data|
29
29
  io.class.input_required?(usecase.type)
30
30
  end
31
- next io unless (!io.input || io.input.empty?) && !cases_with_input.empty?
31
+
32
+ input_is_required = !cases_with_input.empty? || io.options.dig(:input, :entries_from)
33
+ missing_input = !io.input || io.input.empty?
34
+ next io unless missing_input && input_is_required
32
35
 
33
36
  if io.options.dig(:input, :entries_from)
34
37
  io = io.new(input: config.input.get(io: io))
@@ -50,8 +53,7 @@ ASSETS.cli.config do |config|
50
53
  cases_with_people = config.usecases.active(io: io).select do |usecase, data|
51
54
  io.class.people_required?(usecase.type)
52
55
  end
53
- get_people = io.options.dig(:people, :get, :from) == :remote
54
- next io unless !cases_with_people.empty? || get_people
56
+ next io if cases_with_people.empty? && !io.options.dig(:people, :get)
55
57
  io = io.new(people: config.people(io: io))
56
58
  end
57
59
 
@@ -64,7 +66,8 @@ ASSETS.cli.config do |config|
64
66
 
65
67
  wf.before(:usecases) do |wf_cases, io|
66
68
  # save partial entries -> should be native to session.workflow
67
- partial_update = io.options.dig(:people, :get, :type) == :partial
69
+ get_people = io.options.dig(:people, :get)
70
+ partial_update = get_people && get_people.dig(:type) == :partial
68
71
  if !io.options[:dry_run] && partial_update
69
72
  partial_file = io.session.config.people.partial_cache
70
73
  io.session.file_manager.save_json(io.people, partial_file, :timestamp)
@@ -95,11 +98,12 @@ ASSETS.cli.config do |config|
95
98
  if io.session.post_launch.empty?
96
99
  wf_post.skip!
97
100
  else
98
- partial_update = io.options.dig(:people, :get, :type) == :partial
101
+ get_people = io.options.dig(:people, :get)
102
+ partial_update = get_people && get_people.dig(:type) == :partial
99
103
  if !io.options[:dry_run] && partial_update
100
104
  # get target people afresh
101
105
  people = io.session.micro.people_refresh(people: io.people, include_created: true)
102
- io = io.new(people: people)
106
+ io = io.base.new(people: people)
103
107
  else
104
108
  wf_post.skip!
105
109
  msg = "Although there are post_launch cases, they will NOT be RUN"
@@ -136,7 +140,8 @@ ASSETS.cli.config do |config|
136
140
  end
137
141
 
138
142
  wf.on(:end) do |wf_end, io|
139
- partial_update = io.options.dig(:people, :get, :type) == :partial
143
+ get_people = io.options.dig(:people, :get)
144
+ partial_update = get_people && get_people.dig(:type) == :partial
140
145
  unless !io.options[:end_get] || io.options[:dry_run] || partial_update
141
146
  people = io.session.micro.people_cache
142
147
  io = io.new(people: people)
@@ -75,10 +75,10 @@ module Eco
75
75
  def get_file(key, required: false, should_exist: true)
76
76
  filename = get_arg(key, with_param: true)
77
77
  if !filename && required
78
- puts "You need to specify a file '#{key} file'"
78
+ puts "You need to specify a file or folder '#{key} file_or_folder'"
79
79
  exit(1)
80
80
  elsif !file_exists?(filename) && should_exist && required
81
- puts "This file doesn't exist '#{filename}'"
81
+ puts "This file/folder doesn't exist '#{filename}'"
82
82
  exit(1)
83
83
  end
84
84
 
data/lib/eco/csv.rb CHANGED
@@ -18,8 +18,10 @@ module Eco
18
18
  kargs = {headers: true, skip_blanks: true}.merge(kargs)
19
19
 
20
20
  args = [file].tap do |arg|
21
- coding = Eco::API::Common::Session::FileManager.encoding(file)
22
- arg.push("rb:bom|utf-8") if coding == "bom"
21
+ encoding = Eco::API::Common::Session::FileManager.encoding(file)
22
+ #encoding = (encoding != "utf-8")? "#{encoding}|utf-8": encoding
23
+ #arg.push(encoding)
24
+ arg.push("rb:bom|utf-8") if encoding == "bom"
23
25
  end
24
26
 
25
27
  out = super(*args, **kargs).reject do |row|
data/lib/eco/csv/table.rb CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  module Eco
3
2
  class CSV
4
3
  class Table < ::CSV::Table
@@ -9,6 +8,70 @@ module Eco
9
8
  super(to_rows_array(input))
10
9
  end
11
10
 
11
+ # @return [Hash] where keys are the groups and the values a `Eco::CSV::Table`
12
+ def group_by(&block)
13
+ rows.group_by(&block).transform_values do |rows|
14
+ self.class.new(rows)
15
+ end
16
+ end
17
+
18
+ # @return [Eco::CSV::Table]
19
+ def transform_values
20
+ transformed_rows = rows.map do |row|
21
+ res = yield(row)
22
+ case res
23
+ when Array
24
+ ::CSV::Row.new(row.headers, res)
25
+ when ::CSV::Row
26
+ res
27
+ end
28
+ end
29
+ self.class.new(transformed_rows)
30
+ end
31
+
32
+ # Slices the selected rows
33
+ # @return [Eco::CSV::Table]
34
+ def slice(*index)
35
+ case index.first
36
+ when Range, Numeric
37
+ self.class.new(rows.slice(index.first))
38
+ else
39
+ self
40
+ end
41
+ end
42
+
43
+ # @return [Eco::CSV::Table]
44
+ def slice_columns(*index)
45
+ case index.first
46
+ when Range, Numeric
47
+ columns_to_table(columns.slice(index.first))
48
+ when String
49
+ csv_cols = columns
50
+ csv_cols = index.each_with_object([]) do |name, cols|
51
+ col = csv_cols.find {|col| col.first == name}
52
+ cols << col if col
53
+ end
54
+ columns_to_table(csv_cols)
55
+ else
56
+ self
57
+ end
58
+ end
59
+
60
+ # @return [Eco::CSV::Table]
61
+ def delete_column(i)
62
+ csv_cols = columns
63
+ csv_cols.delete(i)
64
+ columns_to_table(csv_cols)
65
+ end
66
+
67
+ # Adds a new column at the end
68
+ # @param header_name [String] header of the new column
69
+ # @return [Eco::CSV::Table] with a new empty column
70
+ def add_column(header_name)
71
+ new_col = Array.new(length).unshift(header_name)
72
+ columns_to_table(columns.push(new_col))
73
+ end
74
+
12
75
  # @return [Array<::CSV::Row>]
13
76
  def rows
14
77
  [].tap do |out|
@@ -16,24 +79,40 @@ module Eco
16
79
  end
17
80
  end
18
81
 
82
+ # It removes duplicated rows: any row whose values all match those of a previous row (the first occurrence is kept)
83
+ def delete_duplicates!
84
+ unique_rows = []
85
+ self.by_row!.delete_if do |row|
86
+ unique_rows.any? {|done| equal_rows?(row, done)}.tap do |found|
87
+ unique_rows << row unless found
88
+ end
89
+ end
90
+ end
91
+
92
+ # @param row1 [::CSV::Row] row to be compared
93
+ # @param row2 [::CSV::Row] row to be compared
94
+ # @return [Boolean] `true` if every value of `row1` equals the value at the same position in `row2`
95
+ def equal_rows?(row1, row2)
96
+ row1.fields.zip(row2.fields).all? do |(v1, v2)|
97
+ v1 == v2
98
+ end
99
+ end
100
+
19
101
  # @return [Integer] total number of rows not including the header
20
102
  def length
21
103
  to_a.length - 1
22
104
  end
23
105
 
106
+ def empty?
107
+ length < 1
108
+ end
109
+
24
110
  # @return [Array<Array>] each array is the column header followed by its values
25
111
  def columns
26
112
  to_a.transpose
27
113
  end
28
114
 
29
- # Adds a new column at the end
30
- # @param header_name [String] header of the new column
31
- # @return [Eco::CSV::Table] with a new empty column
32
- def add_column(header_name)
33
- new_col = Array.new(length).unshift(header_name)
34
- columns_to_table(columns.push(new_col))
35
- end
36
-
115
+ # Creates a single `Hash` where each key, value is a column (header + values)
37
116
  # @note it will override columns with same header name
38
117
  # @return [Hash] keys are headers, values are arrays
39
118
  def columns_hash
@@ -42,6 +121,17 @@ module Eco
42
121
  end.to_h
43
122
  end
44
123
 
124
+ # Returns an array of row hashes
125
+ # @note it will override columns with same header
126
+ def to_a_h
127
+ rows.map(&:to_h)
128
+ end
129
+
130
+ # @see #to_a_h
131
+ def to_array_of_hashes
132
+ to_a_h
133
+ end
134
+
45
135
  private
46
136
 
47
137
  def columns_to_table(columns_array)
@@ -51,24 +141,34 @@ module Eco
51
141
 
52
142
  def to_rows_array(data)
53
143
  case data
54
- when Array
55
- return data unless data.length > 0
56
- if data.first.is_a?(::CSV::Row)
57
- data
58
- elsif data.first.is_a?(Array)
59
- headers = data.shift
60
- data.map do |arr_row|
61
- CSV::Row.new(headers, arr_row)
62
- end.compact
63
- else
64
- raise "Expected data that can be transformed into Array<Array>"
65
- end
66
144
  when ::CSV::Table
67
145
  to_rows_array(data.to_a)
68
146
  when Hash
69
147
  # hash of columns header as key and column array as value
70
148
  rows_arrays = [a.keys].concat(a.values.first.zip(*a.values[1..-1]))
71
149
  to_rows_array(data.keys)
150
+ when Enumerable
151
+ data = data.dup.compact
152
+ return data unless data.count > 0
153
+ sample = data.first
154
+
155
+ case sample
156
+ when ::CSV::Row
157
+ data
158
+ when Array
159
+ headers = data.shift
160
+ data.map do |arr_row|
161
+ ::CSV::Row.new(headers, arr_row)
162
+ end.compact
163
+ when Hash
164
+ headers = sample.keys
165
+ headers_str = headers.map(&:to_s)
166
+ data.map do |hash|
167
+ ::CSV::Row.new(headers_str, hash.values_at(*headers))
168
+ end.compact
169
+ else
170
+ raise "Expected data that can be transformed into Array<::CSV::Row>. Given 'Enumerable' of '#{sample.class}'"
171
+ end
72
172
  else
73
173
  raise "Input type not supported. Given: #{data.class}"
74
174
  end
@@ -27,17 +27,29 @@ module Eco
27
27
  include CharsPositionScore
28
28
  include NGramsScore
29
29
 
30
- def jaro_winkler(str1, str2)
30
+ def jaro_winkler(str1, str2, **options)
31
+ return 0 if !str1 || !str2
31
32
  options = {
32
33
  ignore_case: true,
33
34
  weight: 0.25
34
- }
35
+ }.merge(options)
35
36
  JaroWinkler.distance(str1, str2, **options)
36
37
  end
37
38
 
38
39
  end
39
40
 
40
41
  module InstanceMethods
42
+ FUZZY_MATCH_OPTIONS = [
43
+ :identities, :groupings, :stop_words, :read,
44
+ :must_match_grouping, :must_match_at_least_one_word,
45
+ :gather_last_result, :threshold
46
+ ]
47
+
48
+ JARO_OPTIONS = [:ignore_case, :weight]
49
+ NGRAMS_OPTIONS = [:range]
50
+ POSITION_OPTIONS = [:max_distance]
51
+ RESULTS_OPTIONS = [:order, :threshold]
52
+
41
53
  include StopWords
42
54
 
43
55
  attr_accessor :fuzzy_options
@@ -46,62 +58,132 @@ module Eco
46
58
  @fuzzy_options ||= {}
47
59
  end
48
60
 
49
- def fuzzy_match(haystack = nil, **options)
50
- return @fuzzy_match if instance_variable_defined?(:@fuzzy_match)
51
- @fuzzy_options = options.merge({
52
- stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
53
- })
61
+ def fuzzy_match(haystack_data = nil, **options)
62
+ if instance_variable_defined?(:@fuzzy_match) && !haystack_data
63
+ return @fuzzy_match if fuzzy_match_options == fuzzy_match_options(options)
64
+ end
65
+ @fuzzy_options = options
54
66
  # make it run with a native C extension (for better performance: ~130 % increase of performance)
55
67
  ::FuzzyMatch.engine = :amatch
56
- haystack = obtain_haystack(haystack).tap do |items|
57
- if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
58
- raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
59
- end
60
- end
61
- @fuzzy_match = ::FuzzyMatch.new(haystack, fuzzy_options)
68
+ @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
62
69
  end
63
70
 
71
+ # TODO: integrate options[:unique_words] => to ensure that repeated words do not bring down the score and get results cut by the threshold
64
72
  # @note
65
73
  # - When the `haystack` elements are **non** `String` objects, it excludes the needle itself from the results
66
- # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key
74
+ # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key.
75
+ # @param needle_str [String, nil] the string to match against instead of `needle`; when `nil`, it is read from `needle`.
76
+ # @param haystack [Enumerable] the items to find `needle` among.
67
77
  # @return [Eco::Data::FuzzyMatch::Results]
68
- def find_all_with_score(needle, **options)
69
- results = fuzzy_match(**options).find_all_with_score(needle).each_with_object([]) do |fuzzy_results, results|
78
+ def find_all_with_score(needle, needle_str: nil, haystack: nil, **options)
79
+ base_match = fuzzy_match(haystack, **options)
80
+ match_results = base_match.find_all_with_score(needle_str || needle)
81
+ needle_str ||= item_string(needle)
82
+ results = match_results.each_with_object([]) do |fuzzy_results, results|
70
83
  item, dice, lev = fuzzy_results
71
84
  unless item == needle
72
- needle_str = item_string(needle)
73
- item_str = item_string(item)
74
- jaro_res = self.class.jaro_winkler(needle_str, item_str)
75
- ngram_res = self.class.ngrams_score(needle_str, item_str, range: 3..5).ratio
76
- wngram_res = self.class.words_ngrams_score(needle_str, item_str, range: 3..7).ratio
77
- pos_res = self.class.chars_position_score(needle_str, item_str).ratio
78
- results << Result.new(item, item_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
85
+ item_str = item_string(item)
86
+
87
+ if item_str.to_s.strip.empty? || needle_str.to_s.strip.empty?
88
+ dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res = 0
89
+ end
90
+
91
+ jaro_res ||= jaro(needle_str, item_str)
92
+ ngram_res ||= ngram(needle_str, item_str)
93
+ wngram_res ||= words_ngram(needle_str, item_str)
94
+ pos_res ||= position(needle_str, item_str)
95
+
96
+ results << Result.new(item, item_str, needle_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
97
+ end
98
+ end
99
+ Results.new(needle, needle_str, results).tap do |res|
100
+ res.order = fuzzy_options[:order] if fuzzy_options[:order]
101
+ res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
102
+ end.relevant_results
103
+ end
104
+
105
+ def recalculate_results(results, needle_str: nil, **options)
106
+ raise "You should provide a block |needle_str, item_str, needle, item|" unless block_given?
107
+ new_results = results.each_with_object([]) do |result, new_results|
108
+ nstr, istr = yield(needle_str || results.value, result.value, results.needle, result.match)
109
+
110
+ if istr.to_s.strip.empty?
111
+ dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res = 1
112
+ elsif nstr.to_s.strip.empty?
113
+ unless istr = needle_str
114
+ dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res = 0
115
+ end
79
116
  end
117
+
118
+ res = ::FuzzyMatch.score_class.new(nstr, istr) unless dice && lev
119
+ dice ||= res&.dices_coefficient_similar || 0
120
+ lev ||= res&.levenshtein_similar || 0
121
+ jaro_res ||= jaro(nstr, istr)
122
+ ngram_res ||= ngram(nstr, istr)
123
+ wngram_res ||= words_ngram(nstr, istr)
124
+ pos_res ||= position(nstr, istr)
125
+
126
+ new_results << Result.new(*result.values_at(:match, :value, :needle_str), dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
80
127
  end
81
- Results.new(needle, item_string(needle), results)
128
+ Results.new(results.needle, results.value, new_results).tap do |res|
129
+ res.order = options[:order] if options[:order]
130
+ res.threshold = options[:threshold] if options[:threshold]
131
+ end.relevant_results
82
132
  end
83
133
 
84
134
  private
85
135
 
136
+ def jaro(str1, str2)
137
+ options = fuzzy_options.slice(*JARO_OPTIONS)
138
+ self.class.jaro_winkler(str1, str2, **options)
139
+ end
140
+
141
+ def ngram(str1, str2)
142
+ options = { range: 3..5 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
143
+ self.class.ngrams_score(str1, str2, **options).ratio
144
+ end
145
+
146
+ def words_ngram(str1, str2)
147
+ options = { range: 3..7 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
148
+ self.class.words_ngrams_score(str1, str2, **options).ratio
149
+ end
150
+
151
+ def position(str1, str2)
152
+ options = fuzzy_options.slice(*POSITION_OPTIONS)
153
+ self.class.chars_position_score(str1, str2, **options).ratio
154
+ end
155
+
86
156
  # @note
87
157
  # - When used in an `Enumerable` it will use `to_a`, or `values` if it's a `Hash`
88
158
  # @param data [Enumerable, nil]
89
159
  # @return [Array<Object>] the non-repeated values of `data`
90
- def obtain_haystack(data = nil)
160
+ def haystack(data = nil)
91
161
  data = self if self.is_a?(Enumerable) && !data
92
162
  raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)
93
163
  data = self.is_a?(Hash) ? self.values.flatten : to_a.flatten
94
- data.uniq.compact
164
+ data.uniq.compact.tap do |items|
165
+ if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
166
+ raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
167
+ end
168
+ end
95
169
  end
96
170
 
97
171
  def item_string(item, attr = fuzzy_read_method)
98
172
  return item if !item || item.is_a?(String) || !attr
173
+ return attr.call(item) if attr.is_a?(Proc)
99
174
  attr = attr.to_sym
100
175
  return item.send(attr) if item.respond_to?(attr)
101
176
  end
102
177
 
178
+ def fuzzy_match_options(options = nil)
179
+ options = fuzzy_options unless options
180
+ options.slice(*FUZZY_MATCH_OPTIONS).merge({
181
+ stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
182
+ })
183
+ end
184
+
103
185
  def fuzzy_read_method
104
- fuzzy_options[:read]
186
+ fuzzy_match_options[:read]
105
187
  end
106
188
 
107
189
  end