eco-helpers 2.0.18 → 2.0.19

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +14 -1
  3. data/lib/eco/api/common/people/entry_factory.rb +26 -9
  4. data/lib/eco/api/common/people/person_entry.rb +1 -0
  5. data/lib/eco/api/common/session.rb +1 -0
  6. data/lib/eco/api/common/session/base_session.rb +2 -0
  7. data/lib/eco/api/common/session/helpers.rb +30 -0
  8. data/lib/eco/api/common/session/helpers/prompt_user.rb +34 -0
  9. data/lib/eco/api/common/version_patches/ecoportal_api/external_person.rb +1 -1
  10. data/lib/eco/api/common/version_patches/ecoportal_api/internal_person.rb +7 -4
  11. data/lib/eco/api/microcases/with_each.rb +67 -6
  12. data/lib/eco/api/microcases/with_each_present.rb +4 -2
  13. data/lib/eco/api/microcases/with_each_starter.rb +4 -2
  14. data/lib/eco/api/organization.rb +1 -1
  15. data/lib/eco/api/organization/people.rb +92 -23
  16. data/lib/eco/api/organization/people_similarity.rb +112 -0
  17. data/lib/eco/api/organization/person_schemas.rb +5 -1
  18. data/lib/eco/api/organization/policy_groups.rb +5 -1
  19. data/lib/eco/api/session.rb +5 -2
  20. data/lib/eco/api/session/batch.rb +7 -5
  21. data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +12 -35
  22. data/lib/eco/api/usecases/default_cases/to_csv_case.rb +81 -36
  23. data/lib/eco/api/usecases/default_cases/to_csv_detailed_case.rb +3 -4
  24. data/lib/eco/api/usecases/ooze_samples/ooze_update_case.rb +3 -2
  25. data/lib/eco/cli/config/default/options.rb +2 -1
  26. data/lib/eco/cli/config/default/usecases.rb +2 -0
  27. data/lib/eco/cli/config/default/workflow.rb +4 -1
  28. data/lib/eco/csv.rb +4 -2
  29. data/lib/eco/data/fuzzy_match.rb +63 -21
  30. data/lib/eco/data/fuzzy_match/ngrams_score.rb +7 -2
  31. data/lib/eco/data/fuzzy_match/pairing.rb +0 -1
  32. data/lib/eco/data/fuzzy_match/result.rb +7 -1
  33. data/lib/eco/data/fuzzy_match/results.rb +12 -6
  34. data/lib/eco/version.rb +1 -1
  35. metadata +4 -2
  36. data/lib/eco/api/organization/people_analytics.rb +0 -60
@@ -35,7 +35,8 @@ ASSETS.cli.config do |cnf|
35
35
  session.schema = sch_id
36
36
  end
37
37
 
38
- desc = "Deprecated: used to be used to specify the input file when using -get-partial"
38
+ desc = "Used to be used to specify the input file when using -get-partial. "
39
+ desc += "It can also be useful to obtain `-get-partial` of people base on `:export` use cases (i.e. -people-to-csv)"
39
40
  options_set.add("-entries-from", desc) do |options, session|
40
41
  options.deep_merge!(input: {entries_from: true})
41
42
  end
@@ -48,6 +48,8 @@ ASSETS.cli.config do |cnf|
48
48
  options.deep_merge!(export: {options: {detailed: true}})
49
49
  end.add_option("-permissions-custom", "Used with -detailed. Adds the permissions_custom abilities") do |options|
50
50
  options.deep_merge!(export: {options: {permissions_custom: true}})
51
+ end.add_option("-split-schemas", "It will generate 1 file per each schema") do |options|
52
+ options.deep_merge!(export: {options: {split_schemas: true}})
51
53
  end
52
54
 
53
55
  desc = "Adds a column 'ecoPortalTag' to the input CSV with the tags that the location codes map to"
@@ -28,7 +28,10 @@ ASSETS.cli.config do |config|
28
28
  cases_with_input = config.usecases.active(io: io).select do |usecase, data|
29
29
  io.class.input_required?(usecase.type)
30
30
  end
31
- next io unless (!io.input || io.input.empty?) && !cases_with_input.empty?
31
+
32
+ input_is_required = !cases_with_input.empty? || io.options.dig(:input, :entries_from)
33
+ missing_input = !io.input || io.input.empty?
34
+ next io unless missing_input && input_is_required
32
35
 
33
36
  if io.options.dig(:input, :entries_from)
34
37
  io = io.new(input: config.input.get(io: io))
data/lib/eco/csv.rb CHANGED
@@ -18,8 +18,10 @@ module Eco
18
18
  kargs = {headers: true, skip_blanks: true}.merge(kargs)
19
19
 
20
20
  args = [file].tap do |arg|
21
- coding = Eco::API::Common::Session::FileManager.encoding(file)
22
- arg.push("rb:bom|utf-8") if coding == "bom"
21
+ encoding = Eco::API::Common::Session::FileManager.encoding(file)
22
+ #encoding = (encoding != "utf-8")? "#{encoding}|utf-8": encoding
23
+ #arg.push(encoding)
24
+ arg.push("rb:bom|utf-8") if encoding == "bom"
23
25
  end
24
26
 
25
27
  out = super(*args, **kargs).reject do |row|
@@ -27,17 +27,28 @@ module Eco
27
27
  include CharsPositionScore
28
28
  include NGramsScore
29
29
 
30
- def jaro_winkler(str1, str2)
30
+ def jaro_winkler(str1, str2, **options)
31
31
  options = {
32
32
  ignore_case: true,
33
33
  weight: 0.25
34
- }
34
+ }.merge(options)
35
35
  JaroWinkler.distance(str1, str2, **options)
36
36
  end
37
37
 
38
38
  end
39
39
 
40
40
  module InstanceMethods
41
+ FUZZY_MATCH_OPTIONS = [
42
+ :identities, :groupings, :stop_words, :read,
43
+ :must_match_grouping, :must_match_at_least_one_word,
44
+ :gather_last_result, :threshold
45
+ ]
46
+
47
+ JARO_OPTIONS = [:ignore_case, :weight]
48
+ NGRAMS_OPTIONS = [:range]
49
+ POSITION_OPTIONS = [:max_distance]
50
+ RESULTS_OPTIONS = [:order, :threshold]
51
+
41
52
  include StopWords
42
53
 
43
54
  attr_accessor :fuzzy_options
@@ -46,19 +57,14 @@ module Eco
46
57
  @fuzzy_options ||= {}
47
58
  end
48
59
 
49
- def fuzzy_match(haystack = nil, **options)
50
- return @fuzzy_match if instance_variable_defined?(:@fuzzy_match)
51
- @fuzzy_options = options.merge({
52
- stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
53
- })
60
+ def fuzzy_match(haystack_data = nil, **options)
61
+ if instance_variable_defined?(:@fuzzy_match) && !haystack_data
62
+ return @fuzzy_match if fuzzy_match_options == fuzzy_match_options(options)
63
+ end
64
+ @fuzzy_options = options
54
65
  # make it run with a native C extension (for better performance: ~130 % increase)
55
66
  ::FuzzyMatch.engine = :amatch
56
- haystack = obtain_haystack(haystack).tap do |items|
57
- if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
58
- raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
59
- end
60
- end
61
- @fuzzy_match = ::FuzzyMatch.new(haystack, fuzzy_options)
67
+ @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
62
68
  end
63
69
 
64
70
  # @note
@@ -71,37 +77,73 @@ module Eco
71
77
  unless item == needle
72
78
  needle_str = item_string(needle)
73
79
  item_str = item_string(item)
74
- jaro_res = self.class.jaro_winkler(needle_str, item_str)
75
- ngram_res = self.class.ngrams_score(needle_str, item_str, range: 3..5).ratio
76
- wngram_res = self.class.words_ngrams_score(needle_str, item_str, range: 3..7).ratio
77
- pos_res = self.class.chars_position_score(needle_str, item_str).ratio
80
+ jaro_res = jaro(needle_str, item_str)
81
+ ngram_res = ngram(needle_str, item_str)
82
+ wngram_res = words_ngram(needle_str, item_str)
83
+ pos_res = position(needle_str, item_str)
84
+
78
85
  results << Result.new(item, item_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
79
86
  end
80
87
  end
81
- Results.new(needle, item_string(needle), results)
88
+ Results.new(needle, item_string(needle), results).tap do |res|
89
+ res.order = fuzzy_options[:order] if fuzzy_options[:order]
90
+ res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
91
+ end
82
92
  end
83
93
 
84
94
  private
85
95
 
96
+ def jaro(str1, str2)
97
+ options = fuzzy_options.slice(*JARO_OPTIONS)
98
+ self.class.jaro_winkler(str1, str2, **options)
99
+ end
100
+
101
+ def ngram(str1, str2)
102
+ options = { range: 3..5 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
103
+ self.class.ngrams_score(str1, str2, **options).ratio
104
+ end
105
+
106
+ def words_ngram(str1, str2)
107
+ options = { range: 3..7 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
108
+ self.class.words_ngrams_score(str1, str2, **options).ratio
109
+ end
110
+
111
+ def position(str1, str2)
112
+ options = fuzzy_options.slice(*POSITION_OPTIONS)
113
+ self.class.chars_position_score(str1, str2, **options).ratio
114
+ end
115
+
86
116
  # @note
87
117
  # - When used in an `Enumerable` it will use `to_a`, or `values` if it's a `Hash`
88
118
  # @param data [Enumerable, nil]
89
119
  # @return [Array<Object>] the non-repeated values of `data`
90
- def obtain_haystack(data = nil)
120
+ def haystack(data = nil)
91
121
  data = self if self.is_a?(Enumerable) && !data
92
122
  raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)
93
123
  data = self.is_a?(Hash) ? self.values.flatten : to_a.flatten
94
- data.uniq.compact
124
+ data.uniq.compact.tap do |items|
125
+ if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
126
+ raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
127
+ end
128
+ end
95
129
  end
96
130
 
97
131
  def item_string(item, attr = fuzzy_read_method)
98
132
  return item if !item || item.is_a?(String) || !attr
133
+ return attr.call(item) if attr.is_a?(Proc)
99
134
  attr = attr.to_sym
100
135
  return item.send(attr) if item.respond_to?(attr)
101
136
  end
102
137
 
138
+ def fuzzy_match_options(options = nil)
139
+ options = fuzzy_options unless options
140
+ options.slice(*FUZZY_MATCH_OPTIONS).merge({
141
+ stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
142
+ })
143
+ end
144
+
103
145
  def fuzzy_read_method
104
- fuzzy_options[:read]
146
+ fuzzy_match_options[:read]
105
147
  end
106
148
 
107
149
  end
@@ -16,8 +16,13 @@ module Eco
16
16
 
17
17
  Score.new(0, 0).tap do |score|
18
18
  next if !str2 || !str1
19
- next score.increase(score.total) if str1 == str2
20
- next if str1.length < 2 || str1.length < 2
19
+ if str1 == str2
20
+ score.increase_total(len1)
21
+ score.increase(score.total)
22
+ end
23
+ if str1.length < 2 || str1.length < 2
24
+ score.increase_total(len1)
25
+ end
21
26
 
22
27
  paired_words(str1, str2, normalized: true) do |needle, item|
23
28
  ngrams_score(needle, item, range: range, normalized: true)
@@ -27,7 +27,6 @@ module Eco
27
27
  def paired_words(str1, str2, format: [:pair, :score], normalized: false)
28
28
  str1, str2 = normalize_string([str1, str2]) unless normalized
29
29
  return {} if !str2 || !str1
30
- return score.increase(score.total) if str1 == str2
31
30
  return {str1 => nil} if str1.length < 2 || str1.length < 2
32
31
 
33
32
  needles = get_words(str1, normalized: true)
@@ -11,12 +11,18 @@ module Eco
11
11
  def words_ngrams; super&.round(3); end
12
12
  def chars_position; super&.round(3); end
13
13
 
14
+ def average
15
+ values = [dice, levenshtein, jaro_winkler, ngrams, words_ngrams, chars_position]
16
+ (values.inject(0.0, :+) / values.length).round(3)
17
+ end
18
+
14
19
  # TODO: print in the order of `order`
15
20
  def print
16
21
  msg = "(Dice: #{dice}) (Lev Dst: #{levenshtein}) "
17
22
  msg << "(Jaro: #{jaro_winkler}) "
18
23
  msg << "(Ngram: #{ngrams}) (WNgrams: #{words_ngrams}) "
19
24
  msg << "(C Pos: #{chars_position}) "
25
+ msg << "(Avg: #{average}) "
20
26
  msg << "'#{value}'"
21
27
  end
22
28
 
@@ -37,7 +43,7 @@ module Eco
37
43
 
38
44
  def order=(values)
39
45
  @order = [values].flatten.compact.tap do |o|
40
- o = [:words_ngrams, :dice] if o.empty?
46
+ o << [:words_ngrams, :dice] if o.empty?
41
47
  end
42
48
  end
43
49
 
@@ -3,23 +3,29 @@ module Eco
3
3
  module FuzzyMatch
4
4
  class Results < Struct.new(:needle, :value, :raw_results)
5
5
 
6
+ attr_accessor :threshold
7
+
6
8
  def results_with_false_positives
7
- relevant_results(methods: :jaro_winkler, threshold: 0.5)
9
+ relevant_results(order: :jaro_winkler, threshold: 0.5)
8
10
  end
9
11
 
10
- def relevant_results(methods: order, threshold: 0.5)
12
+ def relevant_results(**options)
13
+ options = {order: order, threshold: threshold || 0.5}.merge(options)
11
14
  raw_results.select do |result|
12
- result.all_threshold?(methods, threshold)
15
+ result.all_threshold?(options[:order], options[:threshold])
13
16
  end.yield_self do |filtered|
14
17
  self.class.new(needle, value, filtered).tap do |results|
15
- results.order = methods
18
+ results.order = options[:order]
16
19
  end
17
20
  end
18
21
  end
19
22
 
23
+ # @param values [Array<Symbol>] the algorithms' results it should be ordered by
24
+ # * Possible values: `:dice`, `:levenshtein`, `:jaro_winkler`, `:ngrams`, `:words_ngrams`, `:chars_position`, `:average`
20
25
  def order=(values)
21
- @order = [values].flatten.compact
22
- raw_results.each {|r| r.order = @order}
26
+ @order = [values].flatten.compact.tap do |o|
27
+ raw_results.each {|r| r.order = o}
28
+ end
23
29
  end
24
30
 
25
31
  def order
data/lib/eco/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Eco
2
- VERSION = "2.0.18"
2
+ VERSION = "2.0.19"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: eco-helpers
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.18
4
+ version: 2.0.19
5
5
  platform: ruby
6
6
  authors:
7
7
  - Oscar Segura
@@ -345,6 +345,8 @@ files:
345
345
  - lib/eco/api/common/session/base_session.rb
346
346
  - lib/eco/api/common/session/environment.rb
347
347
  - lib/eco/api/common/session/file_manager.rb
348
+ - lib/eco/api/common/session/helpers.rb
349
+ - lib/eco/api/common/session/helpers/prompt_user.rb
348
350
  - lib/eco/api/common/session/logger.rb
349
351
  - lib/eco/api/common/session/logger/cache.rb
350
352
  - lib/eco/api/common/session/logger/log.rb
@@ -397,7 +399,7 @@ files:
397
399
  - lib/eco/api/organization.rb
398
400
  - lib/eco/api/organization/login_providers.rb
399
401
  - lib/eco/api/organization/people.rb
400
- - lib/eco/api/organization/people_analytics.rb
402
+ - lib/eco/api/organization/people_similarity.rb
401
403
  - lib/eco/api/organization/person_schemas.rb
402
404
  - lib/eco/api/organization/policy_groups.rb
403
405
  - lib/eco/api/organization/preferences.rb
@@ -1,60 +0,0 @@
1
- module Eco
2
- module API
3
- module Organization
4
- class PeopleAnalytics < Eco::API::Organization::People
5
- include Eco::Data::FuzzyMatch
6
-
7
- # @!group Helpers
8
-
9
- # @!endgroup
10
-
11
- # @!group Searchers
12
-
13
- # It gathers those that have the same `email`
14
- # @return [Hash] where `keys` are `email`s and `values` an `Array<Person>`
15
- def repeated_emails
16
- init_caches
17
- @by_email.select do |email, people|
18
- people.count > 1
19
- end
20
- end
21
-
22
- # @!endgroup
23
-
24
- # @!group Analysers
25
-
26
- # TODO: Sort results by `results.first.methods`
27
- def similarity(**options)
28
- each_with_object({}) do |person, results|
29
- results[person.id] = find_all_with_score(person, **options)
30
- end
31
- end
32
-
33
-
34
- def print_analysis(threshold)
35
- similarity.each do |id, results|
36
- msg = results.results.select do |result|
37
- result.threshold?(threshold)
38
- end.map do |result|
39
- result.print
40
- end.join("\n ")
41
-
42
- puts "'#{self[id].identify}':\n " + msg
43
- end
44
- end
45
- # @!endgroup
46
-
47
- protected
48
-
49
- def on_change
50
- remove_instance_variable(@fuzzy_match)
51
- super
52
- end
53
-
54
- private
55
-
56
-
57
- end
58
- end
59
- end
60
- end