eco-helpers 2.0.19 → 2.0.25

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +77 -1
  3. data/eco-helpers.gemspec +4 -1
  4. data/lib/eco/api/common/base_loader.rb +9 -5
  5. data/lib/eco/api/common/loaders/parser.rb +1 -0
  6. data/lib/eco/api/common/people/default_parsers.rb +1 -0
  7. data/lib/eco/api/common/people/default_parsers/xls_parser.rb +53 -0
  8. data/lib/eco/api/common/people/entries.rb +1 -0
  9. data/lib/eco/api/common/people/entry_factory.rb +64 -16
  10. data/lib/eco/api/common/people/person_parser.rb +1 -1
  11. data/lib/eco/api/common/version_patches/exception.rb +5 -2
  12. data/lib/eco/api/organization/people.rb +8 -2
  13. data/lib/eco/api/organization/people_similarity.rb +171 -11
  14. data/lib/eco/api/organization/tag_tree.rb +33 -0
  15. data/lib/eco/api/session.rb +15 -7
  16. data/lib/eco/api/session/batch.rb +1 -1
  17. data/lib/eco/api/session/batch/job.rb +34 -9
  18. data/lib/eco/api/usecases.rb +2 -2
  19. data/lib/eco/api/usecases/base_case.rb +2 -2
  20. data/lib/eco/api/usecases/base_io.rb +17 -4
  21. data/lib/eco/api/usecases/default_cases.rb +1 -0
  22. data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +189 -19
  23. data/lib/eco/api/usecases/default_cases/clean_unknown_tags_case.rb +37 -0
  24. data/lib/eco/api/usecases/default_cases/hris_case.rb +20 -0
  25. data/lib/eco/cli/config/default/input.rb +61 -8
  26. data/lib/eco/cli/config/default/options.rb +46 -2
  27. data/lib/eco/cli/config/default/people.rb +18 -24
  28. data/lib/eco/cli/config/default/usecases.rb +31 -2
  29. data/lib/eco/cli/config/default/workflow.rb +8 -6
  30. data/lib/eco/cli/scripting/args_helpers.rb +2 -2
  31. data/lib/eco/csv/table.rb +121 -21
  32. data/lib/eco/data/fuzzy_match.rb +52 -12
  33. data/lib/eco/data/fuzzy_match/chars_position_score.rb +3 -2
  34. data/lib/eco/data/fuzzy_match/ngrams_score.rb +13 -9
  35. data/lib/eco/data/fuzzy_match/pairing.rb +12 -18
  36. data/lib/eco/data/fuzzy_match/result.rb +15 -1
  37. data/lib/eco/data/fuzzy_match/results.rb +18 -0
  38. data/lib/eco/data/fuzzy_match/score.rb +12 -7
  39. data/lib/eco/data/fuzzy_match/string_helpers.rb +14 -1
  40. data/lib/eco/language/models/collection.rb +5 -2
  41. data/lib/eco/version.rb +1 -1
  42. metadata +64 -2
@@ -28,6 +28,7 @@ module Eco
28
28
  include NGramsScore
29
29
 
30
30
  def jaro_winkler(str1, str2, **options)
31
+ return 0 if !str1 || !str2
31
32
  options = {
32
33
  ignore_case: true,
33
34
  weight: 0.25
@@ -67,28 +68,67 @@ module Eco
67
68
  @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
68
69
  end
69
70
 
71
+ # TODO: integrate options[:unique_words] => to ensure that repeated words do not bring the score down so that results get cut by the threshold
70
72
  # @note
71
73
  # - When the `haystack` elements are **non** `String` objects, it excludes the needle itself from the results
72
- # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key
74
+ # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key.
75
+ # @param needle_str [String, nil] the string to use for `needle` in the comparisons; when `nil`, it is obtained via `item_string(needle)`.
76
+ # @param haystack [Enumerable] the items to find `needle` among.
73
77
  # @return [Eco::Data::FuzzyMatch::Results]
74
- def find_all_with_score(needle, **options)
75
- results = fuzzy_match(**options).find_all_with_score(needle).each_with_object([]) do |fuzzy_results, results|
78
+ def find_all_with_score(needle, needle_str: nil, haystack: nil, **options)
79
+ base_match = fuzzy_match(haystack, **options)
80
+ match_results = base_match.find_all_with_score(needle_str || needle)
81
+ needle_str ||= item_string(needle)
82
+ results = match_results.each_with_object([]) do |fuzzy_results, results|
76
83
  item, dice, lev = fuzzy_results
77
84
  unless item == needle
78
- needle_str = item_string(needle)
79
- item_str = item_string(item)
80
- jaro_res = jaro(needle_str, item_str)
81
- ngram_res = ngram(needle_str, item_str)
82
- wngram_res = words_ngram(needle_str, item_str)
83
- pos_res = position(needle_str, item_str)
84
-
85
- results << Result.new(item, item_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
85
+ item_str = item_string(item)
86
+
87
+ if item_str.to_s.strip.empty? || needle_str.to_s.strip.empty?
88
+ dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res = 0
89
+ end
90
+
91
+ jaro_res ||= jaro(needle_str, item_str)
92
+ ngram_res ||= ngram(needle_str, item_str)
93
+ wngram_res ||= words_ngram(needle_str, item_str)
94
+ pos_res ||= position(needle_str, item_str)
95
+
96
+ results << Result.new(item, item_str, needle_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
86
97
  end
87
98
  end
88
- Results.new(needle, item_string(needle), results).tap do |res|
99
+ Results.new(needle, needle_str, results).tap do |res|
89
100
  res.order = fuzzy_options[:order] if fuzzy_options[:order]
90
101
  res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
102
+ end.relevant_results
103
+ end
104
+
105
+ def recalculate_results(results, needle_str: nil, **options)
106
+ raise "You should provide a block |needle_str, item_str, needle, item|" unless block_given?
107
+ new_results = results.each_with_object([]) do |result, new_results|
108
+ nstr, istr = yield(needle_str || results.value, result.value, results.needle, result.match)
109
+
110
+ if istr.to_s.strip.empty?
111
+ dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res = 1
112
+ elsif nstr.to_s.strip.empty?
113
+ unless istr = needle_str
114
+ dice = lev = jaro_res = ngram_res = ngram_res = wngram_res = pos_res = 0
115
+ end
116
+ end
117
+
118
+ res = ::FuzzyMatch.score_class.new(nstr, istr) unless dice && lev
119
+ dice ||= res&.dices_coefficient_similar || 0
120
+ lev ||= res&.levenshtein_similar || 0
121
+ jaro_res ||= jaro(nstr, istr)
122
+ ngram_res ||= ngram(nstr, istr)
123
+ wngram_res ||= words_ngram(nstr, istr)
124
+ pos_res ||= position(nstr, istr)
125
+
126
+ new_results << Result.new(*result.values_at(:match, :value, :needle_str), dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
91
127
  end
128
+ Results.new(results.needle, results.value, new_results).tap do |res|
129
+ res.order = options[:order] if options[:order]
130
+ res.threshold = options[:threshold] if options[:threshold]
131
+ end.relevant_results
92
132
  end
93
133
 
94
134
  private
@@ -12,8 +12,9 @@ module Eco
12
12
  def chars_position_score(str1, str2, max_distance: 3, normalized: false)
13
13
  str1, str2 = normalize_string([str1, str2]) unless normalized
14
14
  len1 = str1 && str1.length; len2 = str2 && str2.length
15
- Score.new(0, len1 || 0).tap do |score|
16
- next if !str1 || !str2
15
+ Score.new(0, 0).tap do |score|
16
+ next if !str2 || !str1 || str2.empty? || str1.empty?
17
+ score.total = len1
17
18
  next score.increase(score.total) if str1 == str2
18
19
  next if len1 < 2
19
20
  pos = 0
@@ -16,18 +16,19 @@ module Eco
16
16
 
17
17
  Score.new(0, 0).tap do |score|
18
18
  next if !str2 || !str1
19
+ next score.increase_total(len1) if str2.empty? || str1.empty?
19
20
  if str1 == str2
20
- score.increase_total(len1)
21
+ score.total = len1
21
22
  score.increase(score.total)
22
23
  end
23
24
  if str1.length < 2 || str1.length < 2
24
25
  score.increase_total(len1)
25
26
  end
26
27
 
27
- paired_words(str1, str2, normalized: true) do |needle, item|
28
+ pairs = paired_words(str1, str2, normalized: true) do |needle, item|
28
29
  ngrams_score(needle, item, range: range, normalized: true)
29
- end.each do |sub_str1, (item, iscore)|
30
- #puts "pairs '#{sub_str1}' --> '#{item}' (score: #{iscore.ratio})"
30
+ end.each do |sub_str1, data|
31
+ item, iscore = data
31
32
  score.merge!(iscore)
32
33
  end
33
34
  end
@@ -44,14 +45,17 @@ module Eco
44
45
 
45
46
  Score.new(0, len1 || 0).tap do |score|
46
47
  next if !str2 || !str1
48
+ next if str2.empty? || str1.empty?
49
+ score.total = len1
47
50
  next score.increase(score.total) if str1 == str2
48
51
  next if str1.length < 2 || str2.length < 2
49
52
 
50
- grams = word_ngrams(str2, range, normalized: true)
51
- next unless grams.length > 0
53
+ grams = word_ngrams(str2, range, normalized: true)
54
+ grams_count = grams.length
55
+ next unless grams_count > 0
52
56
 
53
57
  if range.is_a?(Integer)
54
- item_weight = score.total.to_f / grams.length
58
+ item_weight = score.total.to_f / grams_count
55
59
  matches = grams.select {|res| str1.include?(gram)}.length
56
60
  score.increase(matches * item_weight)
57
61
  else
@@ -62,9 +66,9 @@ module Eco
62
66
 
63
67
  groups.each do |len, grams|
64
68
  len_max_score = score.total * group_weight
65
- item_weight = len_max_score / grams.length
69
+ item_weight = len_max_score / grams_count
66
70
  matches = grams.select {|gram| str1.include?(gram)}.length
67
- #pp "#{len} match: #{matches} (over #{grams.length}) || max_score: #{len_max_score} (over #{score.total})"
71
+ #pp "(#{len}) match: #{matches} (of #{grams.length} of total #{grams_count}) || max_score: #{len_max_score} (over #{score.total})"
68
72
  score.increase(matches * item_weight)
69
73
  end
70
74
  end
@@ -15,19 +15,12 @@ module Eco
15
15
  # @yieldreturn [Eco::Data::FuzzyMatch::Score] the `Score` object with the results of comparing `str1` and `str2`
16
16
  # @param str1 [String] the string of reference.
17
17
  # @param str2 [String] one of the haystack items.
18
- # @param format [Symbol] determines the `values` of the returned `Hash`::
19
- # 1. `:pair` for just pair
20
- # 2. `:score` for just score
21
- # 2. `[:pair, :score]` for `Array`
22
18
  # @param normalized [Boolean] whether `str1` and `str2` are already normalized (to avoid normalizing them twice).
23
- # @return [Hash] where `keys` are the **words** of `str1` and their `values`:
24
- # 1. if `format` is `:pair` => the `str2` words with highest match.
25
- # 2. if `format` is `:score` => the `Score` words with highest match.
26
- # 3. if `format` is `[:pair, :score]` => both in an `Array`.
27
- def paired_words(str1, str2, format: [:pair, :score], normalized: false)
19
+ # @return [Hash] where the `keys` are the **words** of `str1` and the `values` are two-element arrays `[pair, Score]`
20
+ def paired_words(str1, str2, normalized: false)
28
21
  str1, str2 = normalize_string([str1, str2]) unless normalized
29
- return {} if !str2 || !str1
30
- return {str1 => nil} if str1.length < 2 || str1.length < 2
22
+ return {nil => [nil, Score.new(0, 0)]} if !str2 || !str1
23
+ return {str1 => [nil, Score.new(0, 0)]} if str1.length < 2 || str1.length < 2
31
24
 
32
25
  needles = get_words(str1, normalized: true)
33
26
  haystack = get_words(str2, normalized: true)
@@ -58,6 +51,9 @@ module Eco
58
51
  result[:score].ratio
59
52
  end.reverse
60
53
  if result = sorted.shift
54
+ unless result[:score].is_a?(Eco::Data::FuzzyMatch::Score)
55
+ raise "Parining ('#{str1}' vs '#{str2}') -> Something got sour with needle '#{result[:needle]}' and item #{item}"
56
+ end
61
57
  paired[result[:needle]] = {
62
58
  pair: item,
63
59
  score: result[:score]
@@ -73,6 +69,9 @@ module Eco
73
69
  pending_items.include?(result[:pair]) && result[:score].ratio > 0.05
74
70
  end
75
71
  if result = results.shift
72
+ unless result[:score].is_a?(Eco::Data::FuzzyMatch::Score)
73
+ raise "Parining ('#{str1}' vs '#{str2}') -> Something got sour with needle '#{needle}' and item #{result[:pair]}"
74
+ end
76
75
  paired[needle] = result
77
76
  pending_items.delete(result[:pair])
78
77
  end
@@ -85,13 +84,8 @@ module Eco
85
84
  score: Score.new(0, needle.length)
86
85
  }
87
86
  end
88
- paired.transform_values do |result|
89
- case format
90
- when Array
91
- result.values_at(*format)
92
- else
93
- restult[format]
94
- end
87
+ paired.each_with_object({}) do |(needle, data), out|
88
+ out[needle] = data.values_at(:pair, :score)
95
89
  end
96
90
  end
97
91
 
@@ -1,9 +1,11 @@
1
1
  module Eco
2
2
  module Data
3
3
  module FuzzyMatch
4
- class Result < Struct.new(:match, :value, :dice, :levenshtein, :jaro_winkler, :ngrams, :words_ngrams, :chars_position)
4
+ class Result < Struct.new(:match, :value, :needle_value, :dice, :levenshtein, :jaro_winkler, :ngrams, :words_ngrams, :chars_position)
5
5
  ALL_METHODS = [:dice, :levenshtein, :jaro_winkler, :ngrams, :words_ngrams, :chars_position]
6
6
 
7
+ attr_accessor :pivot
8
+
7
9
  def dice; super&.round(3); end
8
10
  def levenshtein; super&.round(3); end
9
11
  def jaro_winkler; super&.round(3); end
@@ -11,6 +13,12 @@ module Eco
11
13
  def words_ngrams; super&.round(3); end
12
14
  def chars_position; super&.round(3); end
13
15
 
16
+ #Shortcuts
17
+ def lev; levenshtein; end
18
+ def jaro; jaro_winkler; end
19
+ def wngrams; words_ngrams; end
20
+ def pos; chars_position; end
21
+
14
22
  def average
15
23
  values = [dice, levenshtein, jaro_winkler, ngrams, words_ngrams, chars_position]
16
24
  (values.inject(0.0, :+) / values.length).round(3)
@@ -55,6 +63,12 @@ module Eco
55
63
  compare(result)
56
64
  end
57
65
 
66
+ def values_at(*keys)
67
+ keys.map do |key|
68
+ self.send(key) if self.respond_to?(key)
69
+ end
70
+ end
71
+
58
72
  private
59
73
 
60
74
  def compare(other, order: self.order)
@@ -2,9 +2,27 @@ module Eco
2
2
  module Data
3
3
  module FuzzyMatch
4
4
  class Results < Struct.new(:needle, :value, :raw_results)
5
+ include Enumerable
5
6
 
6
7
  attr_accessor :threshold
7
8
 
9
+ def empty?
10
+ count < 1
11
+ end
12
+
13
+ def each(&block)
14
+ return to_enum(:each) unless block
15
+ raw_results.each(&block)
16
+ end
17
+
18
+ # Merges the results of both `Results` objects
19
+ def merge(res)
20
+ unless self.needle == res.needle
21
+ raise "To merge 2 Results, needle should be the same ('#{value}'). Given '#{res.value}'"
22
+ end
23
+ self.class.new(needle, value, raw_results.concat(res.raw_results))
24
+ end
25
+
8
26
  def results_with_false_positives
9
27
  relevant_results(order: :jaro_winkler, threshold: 0.5)
10
28
  end
@@ -4,7 +4,10 @@ module Eco
4
4
  class Score < Struct.new(:score, :total)
5
5
 
6
6
  def ratio(decimals = 6)
7
- ((score || 0).to_f / (total || 1)).round(decimals)
7
+ tot = self.total; sc = self.score
8
+ tot = tot && tot > 0 ? tot : 1
9
+ sc = sc && sc > 0 ? sc : 0
10
+ (sc.to_f / tot).round(decimals)
8
11
  end
9
12
 
10
13
  def percent(decimals = 3)
@@ -13,6 +16,8 @@ module Eco
13
16
 
14
17
  def increase(value = 1)
15
18
  self.score += value
19
+ raise "Score #{self.score} (increase: #{value}) can't be greater than total #{self.total}" if self.score > self.total
20
+ self.score
16
21
  end
17
22
 
18
23
  def increase_total(value)
@@ -26,14 +31,14 @@ module Eco
26
31
  end
27
32
 
28
33
  # Merges 2 Score instance objects
29
- def merge(value)
30
- Score.new(*values_at(:score, :total)).merge!(value)
34
+ def merge(scr)
35
+ Score.new(*values_at(:score, :total)).merge!(scr)
31
36
  end
32
37
 
33
- def merge!(value)
34
- raise "Expecting Score object. Given: #{value.class}" unless value.is_a?(Score)
35
- increase(value.score)
36
- increase_total(value.total)
38
+ def merge!(scr)
39
+ raise "Expecting Score object. Given: #{scr.class}" unless scr.is_a?(Score)
40
+ increase_total(scr.total)
41
+ increase(scr.score)
37
42
  self
38
43
  end
39
44
 
@@ -17,7 +17,7 @@ module Eco
17
17
  def get_words(str, normalized: false)
18
18
  return [] unless str
19
19
  str = normalize_string(str) unless normalized
20
- str.scan(/[a-zA-Z'-]+/)
20
+ str.scan(/[a-zA-Z'-]+/).compact
21
21
  end
22
22
 
23
23
  # Keeps the start order of the `words` and consecutive `words` together/consecutive.
@@ -63,6 +63,19 @@ module Eco
63
63
  str.tr(' ', '')
64
64
  end
65
65
 
66
+ # Removes from `str1` and `str2` the words that they have in common
67
+ # @return [Array<String>] the pair of strings, each with its remaining (non-shared) words joined by a space.
68
+ def remove_matching_words(str1, str2, normalized: false)
69
+ unless normalized
70
+ str1 = normalize_string(str1)
71
+ str2 = normalize_string(str2)
72
+ end
73
+ return [str1, str2] if !str1 || !str2 || str1.empty? || str2.empty?
74
+ ws1 = get_words(str1)
75
+ ws2 = get_words(str2)
76
+ [(ws1 - ws2).join(" "), (ws2 - ws1).join(" ")]
77
+ end
78
+
66
79
  end
67
80
  end
68
81
  end
@@ -134,8 +134,11 @@ module Eco
134
134
  to_a.group_by(&block) if block
135
135
  end
136
136
 
137
- def to_h(attr)
138
- return {} if !attr
137
+ # Groups the elements by a specific `attr` or by the result of a block
138
+ # @note either one or the other should be present
139
+ def to_h(attr, &block)
140
+ return to_a.group_by(&block) if block
141
+ raise "And attr or a block are required. Given attr: #{attr}" unless attr
139
142
  to_a.group_by { |object| object.method(attr).call }
140
143
  end
141
144
  # @!endgroup
data/lib/eco/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Eco
2
- VERSION = "2.0.19"
2
+ VERSION = "2.0.25"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: eco-helpers
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.19
4
+ version: 2.0.25
5
5
  platform: ruby
6
6
  authors:
7
7
  - Oscar Segura
@@ -230,6 +230,26 @@ dependencies:
230
230
  - - "<"
231
231
  - !ruby/object:Gem::Version
232
232
  version: '3.1'
233
+ - !ruby/object:Gem::Dependency
234
+ name: hashdiff
235
+ requirement: !ruby/object:Gem::Requirement
236
+ requirements:
237
+ - - ">="
238
+ - !ruby/object:Gem::Version
239
+ version: 1.0.1
240
+ - - "<"
241
+ - !ruby/object:Gem::Version
242
+ version: '1.1'
243
+ type: :runtime
244
+ prerelease: false
245
+ version_requirements: !ruby/object:Gem::Requirement
246
+ requirements:
247
+ - - ">="
248
+ - !ruby/object:Gem::Version
249
+ version: 1.0.1
250
+ - - "<"
251
+ - !ruby/object:Gem::Version
252
+ version: '1.1'
233
253
  - !ruby/object:Gem::Dependency
234
254
  name: fuzzy_match
235
255
  requirement: !ruby/object:Gem::Requirement
@@ -290,6 +310,46 @@ dependencies:
290
310
  - - "<"
291
311
  - !ruby/object:Gem::Version
292
312
  version: '1.6'
313
+ - !ruby/object:Gem::Dependency
314
+ name: roo
315
+ requirement: !ruby/object:Gem::Requirement
316
+ requirements:
317
+ - - ">="
318
+ - !ruby/object:Gem::Version
319
+ version: 2.8.3
320
+ - - "<"
321
+ - !ruby/object:Gem::Version
322
+ version: '2.9'
323
+ type: :runtime
324
+ prerelease: false
325
+ version_requirements: !ruby/object:Gem::Requirement
326
+ requirements:
327
+ - - ">="
328
+ - !ruby/object:Gem::Version
329
+ version: 2.8.3
330
+ - - "<"
331
+ - !ruby/object:Gem::Version
332
+ version: '2.9'
333
+ - !ruby/object:Gem::Dependency
334
+ name: roo-xls
335
+ requirement: !ruby/object:Gem::Requirement
336
+ requirements:
337
+ - - ">="
338
+ - !ruby/object:Gem::Version
339
+ version: 1.2.0
340
+ - - "<"
341
+ - !ruby/object:Gem::Version
342
+ version: '1.3'
343
+ type: :runtime
344
+ prerelease: false
345
+ version_requirements: !ruby/object:Gem::Requirement
346
+ requirements:
347
+ - - ">="
348
+ - !ruby/object:Gem::Version
349
+ version: 1.2.0
350
+ - - "<"
351
+ - !ruby/object:Gem::Version
352
+ version: '1.3'
293
353
  description:
294
354
  email:
295
355
  - oscar@ecoportal.co.nz
@@ -332,6 +392,7 @@ files:
332
392
  - lib/eco/api/common/people/default_parsers/policy_groups_parser.rb
333
393
  - lib/eco/api/common/people/default_parsers/select_parser.rb
334
394
  - lib/eco/api/common/people/default_parsers/send_invites_parser.rb
395
+ - lib/eco/api/common/people/default_parsers/xls_parser.rb
335
396
  - lib/eco/api/common/people/entries.rb
336
397
  - lib/eco/api/common/people/entry_factory.rb
337
398
  - lib/eco/api/common/people/person_attribute_parser.rb
@@ -443,6 +504,7 @@ files:
443
504
  - lib/eco/api/usecases/default_cases/analyse_people_case.rb
444
505
  - lib/eco/api/usecases/default_cases/append_usergroups_case.rb
445
506
  - lib/eco/api/usecases/default_cases/change_email_case.rb
507
+ - lib/eco/api/usecases/default_cases/clean_unknown_tags_case.rb
446
508
  - lib/eco/api/usecases/default_cases/codes_to_tags_case.rb
447
509
  - lib/eco/api/usecases/default_cases/create_case.rb
448
510
  - lib/eco/api/usecases/default_cases/create_details_case.rb
@@ -547,7 +609,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
547
609
  requirements:
548
610
  - - ">="
549
611
  - !ruby/object:Gem::Version
550
- version: 2.4.4
612
+ version: 2.5.0
551
613
  required_rubygems_version: !ruby/object:Gem::Requirement
552
614
  requirements:
553
615
  - - ">="