eco-helpers 2.0.18 → 2.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +14 -1
  3. data/lib/eco/api/common/people/entry_factory.rb +26 -9
  4. data/lib/eco/api/common/people/person_entry.rb +1 -0
  5. data/lib/eco/api/common/session.rb +1 -0
  6. data/lib/eco/api/common/session/base_session.rb +2 -0
  7. data/lib/eco/api/common/session/helpers.rb +30 -0
  8. data/lib/eco/api/common/session/helpers/prompt_user.rb +34 -0
  9. data/lib/eco/api/common/version_patches/ecoportal_api/external_person.rb +1 -1
  10. data/lib/eco/api/common/version_patches/ecoportal_api/internal_person.rb +7 -4
  11. data/lib/eco/api/microcases/with_each.rb +67 -6
  12. data/lib/eco/api/microcases/with_each_present.rb +4 -2
  13. data/lib/eco/api/microcases/with_each_starter.rb +4 -2
  14. data/lib/eco/api/organization.rb +1 -1
  15. data/lib/eco/api/organization/people.rb +92 -23
  16. data/lib/eco/api/organization/people_similarity.rb +112 -0
  17. data/lib/eco/api/organization/person_schemas.rb +5 -1
  18. data/lib/eco/api/organization/policy_groups.rb +5 -1
  19. data/lib/eco/api/session.rb +5 -2
  20. data/lib/eco/api/session/batch.rb +7 -5
  21. data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +12 -35
  22. data/lib/eco/api/usecases/default_cases/to_csv_case.rb +81 -36
  23. data/lib/eco/api/usecases/default_cases/to_csv_detailed_case.rb +3 -4
  24. data/lib/eco/api/usecases/ooze_samples/ooze_update_case.rb +3 -2
  25. data/lib/eco/cli/config/default/options.rb +2 -1
  26. data/lib/eco/cli/config/default/usecases.rb +2 -0
  27. data/lib/eco/cli/config/default/workflow.rb +4 -1
  28. data/lib/eco/csv.rb +4 -2
  29. data/lib/eco/data/fuzzy_match.rb +63 -21
  30. data/lib/eco/data/fuzzy_match/ngrams_score.rb +7 -2
  31. data/lib/eco/data/fuzzy_match/pairing.rb +0 -1
  32. data/lib/eco/data/fuzzy_match/result.rb +7 -1
  33. data/lib/eco/data/fuzzy_match/results.rb +12 -6
  34. data/lib/eco/version.rb +1 -1
  35. metadata +4 -2
  36. data/lib/eco/api/organization/people_analytics.rb +0 -60
@@ -35,7 +35,8 @@ ASSETS.cli.config do |cnf|
35
35
  session.schema = sch_id
36
36
  end
37
37
 
38
- desc = "Deprecated: used to be used to specify the input file when using -get-partial"
38
+ desc = "Used to be used to specify the input file when using -get-partial. "
39
+ desc += "It can also be useful to obtain `-get-partial` of people base on `:export` use cases (i.e. -people-to-csv)"
39
40
  options_set.add("-entries-from", desc) do |options, session|
40
41
  options.deep_merge!(input: {entries_from: true})
41
42
  end
@@ -48,6 +48,8 @@ ASSETS.cli.config do |cnf|
48
48
  options.deep_merge!(export: {options: {detailed: true}})
49
49
  end.add_option("-permissions-custom", "Used with -detailed. Adds the permissions_custom abilities") do |options|
50
50
  options.deep_merge!(export: {options: {permissions_custom: true}})
51
+ end.add_option("-split-schemas", "It will generate 1 file per each schema") do |options|
52
+ options.deep_merge!(export: {options: {split_schemas: true}})
51
53
  end
52
54
 
53
55
  desc = "Adds a column 'ecoPortalTag' to the input CSV with the tags that the location codes map to"
@@ -28,7 +28,10 @@ ASSETS.cli.config do |config|
28
28
  cases_with_input = config.usecases.active(io: io).select do |usecase, data|
29
29
  io.class.input_required?(usecase.type)
30
30
  end
31
- next io unless (!io.input || io.input.empty?) && !cases_with_input.empty?
31
+
32
+ input_is_required = !cases_with_input.empty? || io.options.dig(:input, :entries_from)
33
+ missing_input = !io.input || io.input.empty?
34
+ next io unless missing_input && input_is_required
32
35
 
33
36
  if io.options.dig(:input, :entries_from)
34
37
  io = io.new(input: config.input.get(io: io))
data/lib/eco/csv.rb CHANGED
@@ -18,8 +18,10 @@ module Eco
18
18
  kargs = {headers: true, skip_blanks: true}.merge(kargs)
19
19
 
20
20
  args = [file].tap do |arg|
21
- coding = Eco::API::Common::Session::FileManager.encoding(file)
22
- arg.push("rb:bom|utf-8") if coding == "bom"
21
+ encoding = Eco::API::Common::Session::FileManager.encoding(file)
22
+ #encoding = (encoding != "utf-8")? "#{encoding}|utf-8": encoding
23
+ #arg.push(encoding)
24
+ arg.push("rb:bom|utf-8") if encoding == "bom"
23
25
  end
24
26
 
25
27
  out = super(*args, **kargs).reject do |row|
@@ -27,17 +27,28 @@ module Eco
27
27
  include CharsPositionScore
28
28
  include NGramsScore
29
29
 
30
- def jaro_winkler(str1, str2)
30
+ def jaro_winkler(str1, str2, **options)
31
31
  options = {
32
32
  ignore_case: true,
33
33
  weight: 0.25
34
- }
34
+ }.merge(options)
35
35
  JaroWinkler.distance(str1, str2, **options)
36
36
  end
37
37
 
38
38
  end
39
39
 
40
40
  module InstanceMethods
41
+ FUZZY_MATCH_OPTIONS = [
42
+ :identities, :groupings, :stop_words, :read,
43
+ :must_match_grouping, :must_match_at_least_one_word,
44
+ :gather_last_result, :threshold
45
+ ]
46
+
47
+ JARO_OPTIONS = [:ignore_case, :weight]
48
+ NGRAMS_OPTIONS = [:range]
49
+ POSITION_OPTIONS = [:max_distance]
50
+ RESULTS_OPTIONS = [:order, :threshold]
51
+
41
52
  include StopWords
42
53
 
43
54
  attr_accessor :fuzzy_options
@@ -46,19 +57,14 @@ module Eco
46
57
  @fuzzy_options ||= {}
47
58
  end
48
59
 
49
- def fuzzy_match(haystack = nil, **options)
50
- return @fuzzy_match if instance_variable_defined?(:@fuzzy_match)
51
- @fuzzy_options = options.merge({
52
- stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
53
- })
60
+ def fuzzy_match(haystack_data = nil, **options)
61
+ if instance_variable_defined?(:@fuzzy_match) && !haystack_data
62
+ return @fuzzy_match if fuzzy_match_options == fuzzy_match_options(options)
63
+ end
64
+ @fuzzy_options = options
54
65
  # make it run with a native C extension (for better performance: ~130 % increase of performance)
55
66
  ::FuzzyMatch.engine = :amatch
56
- haystack = obtain_haystack(haystack).tap do |items|
57
- if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
58
- raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
59
- end
60
- end
61
- @fuzzy_match = ::FuzzyMatch.new(haystack, fuzzy_options)
67
+ @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
62
68
  end
63
69
 
64
70
  # @note
@@ -71,37 +77,73 @@ module Eco
71
77
  unless item == needle
72
78
  needle_str = item_string(needle)
73
79
  item_str = item_string(item)
74
- jaro_res = self.class.jaro_winkler(needle_str, item_str)
75
- ngram_res = self.class.ngrams_score(needle_str, item_str, range: 3..5).ratio
76
- wngram_res = self.class.words_ngrams_score(needle_str, item_str, range: 3..7).ratio
77
- pos_res = self.class.chars_position_score(needle_str, item_str).ratio
80
+ jaro_res = jaro(needle_str, item_str)
81
+ ngram_res = ngram(needle_str, item_str)
82
+ wngram_res = words_ngram(needle_str, item_str)
83
+ pos_res = position(needle_str, item_str)
84
+
78
85
  results << Result.new(item, item_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
79
86
  end
80
87
  end
81
- Results.new(needle, item_string(needle), results)
88
+ Results.new(needle, item_string(needle), results).tap do |res|
89
+ res.order = fuzzy_options[:order] if fuzzy_options[:order]
90
+ res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
91
+ end
82
92
  end
83
93
 
84
94
  private
85
95
 
96
+ def jaro(str1, str2)
97
+ options = fuzzy_options.slice(*JARO_OPTIONS)
98
+ self.class.jaro_winkler(str1, str2, **options)
99
+ end
100
+
101
+ def ngram(str1, str2)
102
+ options = { range: 3..5 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
103
+ self.class.ngrams_score(str1, str2, **options).ratio
104
+ end
105
+
106
+ def words_ngram(str1, str2)
107
+ options = { range: 3..7 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
108
+ self.class.words_ngrams_score(str1, str2, **options).ratio
109
+ end
110
+
111
+ def position(str1, str2)
112
+ options = fuzzy_options.slice(*POSITION_OPTIONS)
113
+ self.class.chars_position_score(str1, str2, **options).ratio
114
+ end
115
+
86
116
  # @note
87
117
  # - When used in an `Enumerable` it will use `to_a`, or `values` if it's a `Hash`
88
118
  # @param data [Enumerable, nil]
89
119
  # @return [Array<Object>] the non-repeated values of `data`
90
- def obtain_haystack(data = nil)
120
+ def haystack(data = nil)
91
121
  data = self if self.is_a?(Enumerable) && !data
92
122
  raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)
93
123
  data = self.is_a?(Hash) ? self.values.flatten : to_a.flatten
94
- data.uniq.compact
124
+ data.uniq.compact.tap do |items|
125
+ if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
126
+ raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
127
+ end
128
+ end
95
129
  end
96
130
 
97
131
  def item_string(item, attr = fuzzy_read_method)
98
132
  return item if !item || item.is_a?(String) || !attr
133
+ return attr.call(item) if attr.is_a?(Proc)
99
134
  attr = attr.to_sym
100
135
  return item.send(attr) if item.respond_to?(attr)
101
136
  end
102
137
 
138
+ def fuzzy_match_options(options = nil)
139
+ options = fuzzy_options unless options
140
+ options.slice(*FUZZY_MATCH_OPTIONS).merge({
141
+ stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
142
+ })
143
+ end
144
+
103
145
  def fuzzy_read_method
104
- fuzzy_options[:read]
146
+ fuzzy_match_options[:read]
105
147
  end
106
148
 
107
149
  end
@@ -16,8 +16,13 @@ module Eco
16
16
 
17
17
  Score.new(0, 0).tap do |score|
18
18
  next if !str2 || !str1
19
- next score.increase(score.total) if str1 == str2
20
- next if str1.length < 2 || str1.length < 2
19
+ if str1 == str2
20
+ score.increase_total(len1)
21
+ score.increase(score.total)
22
+ end
23
+ if str1.length < 2 || str1.length < 2
24
+ score.increase_total(len1)
25
+ end
21
26
 
22
27
  paired_words(str1, str2, normalized: true) do |needle, item|
23
28
  ngrams_score(needle, item, range: range, normalized: true)
@@ -27,7 +27,6 @@ module Eco
27
27
  def paired_words(str1, str2, format: [:pair, :score], normalized: false)
28
28
  str1, str2 = normalize_string([str1, str2]) unless normalized
29
29
  return {} if !str2 || !str1
30
- return score.increase(score.total) if str1 == str2
31
30
  return {str1 => nil} if str1.length < 2 || str1.length < 2
32
31
 
33
32
  needles = get_words(str1, normalized: true)
@@ -11,12 +11,18 @@ module Eco
11
11
  def words_ngrams; super&.round(3); end
12
12
  def chars_position; super&.round(3); end
13
13
 
14
+ def average
15
+ values = [dice, levenshtein, jaro_winkler, ngrams, words_ngrams, chars_position]
16
+ (values.inject(0.0, :+) / values.length).round(3)
17
+ end
18
+
14
19
  # TODO: print in the order of `order`
15
20
  def print
16
21
  msg = "(Dice: #{dice}) (Lev Dst: #{levenshtein}) "
17
22
  msg << "(Jaro: #{jaro_winkler}) "
18
23
  msg << "(Ngram: #{ngrams}) (WNgrams: #{words_ngrams}) "
19
24
  msg << "(C Pos: #{chars_position}) "
25
+ msg << "(Avg: #{average}) "
20
26
  msg << "'#{value}'"
21
27
  end
22
28
 
@@ -37,7 +43,7 @@ module Eco
37
43
 
38
44
  def order=(values)
39
45
  @order = [values].flatten.compact.tap do |o|
40
- o = [:words_ngrams, :dice] if o.empty?
46
+ o << [:words_ngrams, :dice] if o.empty?
41
47
  end
42
48
  end
43
49
 
@@ -3,23 +3,29 @@ module Eco
3
3
  module FuzzyMatch
4
4
  class Results < Struct.new(:needle, :value, :raw_results)
5
5
 
6
+ attr_accessor :threshold
7
+
6
8
  def results_with_false_positives
7
- relevant_results(methods: :jaro_winkler, threshold: 0.5)
9
+ relevant_results(order: :jaro_winkler, threshold: 0.5)
8
10
  end
9
11
 
10
- def relevant_results(methods: order, threshold: 0.5)
12
+ def relevant_results(**options)
13
+ options = {order: order, threshold: threshold || 0.5}.merge(options)
11
14
  raw_results.select do |result|
12
- result.all_threshold?(methods, threshold)
15
+ result.all_threshold?(options[:order], options[:threshold])
13
16
  end.yield_self do |filtered|
14
17
  self.class.new(needle, value, filtered).tap do |results|
15
- results.order = methods
18
+ results.order = options[:order]
16
19
  end
17
20
  end
18
21
  end
19
22
 
23
+ # @param values[Array<Symbol>] the algorithms' results it should be ordered by
24
+ # * Possible values: `:dice`, `:levenshtein`, `:jaro_winkler`, `:ngrams`, `:words_ngrams`, `:chars_position`, `:average`
20
25
  def order=(values)
21
- @order = [values].flatten.compact
22
- raw_results.each {|r| r.order = @order}
26
+ @order = [values].flatten.compact.tap do |o|
27
+ raw_results.each {|r| r.order = o}
28
+ end
23
29
  end
24
30
 
25
31
  def order
data/lib/eco/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Eco
2
- VERSION = "2.0.18"
2
+ VERSION = "2.0.19"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: eco-helpers
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.18
4
+ version: 2.0.19
5
5
  platform: ruby
6
6
  authors:
7
7
  - Oscar Segura
@@ -345,6 +345,8 @@ files:
345
345
  - lib/eco/api/common/session/base_session.rb
346
346
  - lib/eco/api/common/session/environment.rb
347
347
  - lib/eco/api/common/session/file_manager.rb
348
+ - lib/eco/api/common/session/helpers.rb
349
+ - lib/eco/api/common/session/helpers/prompt_user.rb
348
350
  - lib/eco/api/common/session/logger.rb
349
351
  - lib/eco/api/common/session/logger/cache.rb
350
352
  - lib/eco/api/common/session/logger/log.rb
@@ -397,7 +399,7 @@ files:
397
399
  - lib/eco/api/organization.rb
398
400
  - lib/eco/api/organization/login_providers.rb
399
401
  - lib/eco/api/organization/people.rb
400
- - lib/eco/api/organization/people_analytics.rb
402
+ - lib/eco/api/organization/people_similarity.rb
401
403
  - lib/eco/api/organization/person_schemas.rb
402
404
  - lib/eco/api/organization/policy_groups.rb
403
405
  - lib/eco/api/organization/preferences.rb
@@ -1,60 +0,0 @@
1
- module Eco
2
- module API
3
- module Organization
4
- class PeopleAnalytics < Eco::API::Organization::People
5
- include Eco::Data::FuzzyMatch
6
-
7
- # @!group Helpers
8
-
9
- # @!endgroup
10
-
11
- # @!group Searchers
12
-
13
- # It gathers those that have the same `email`
14
- # @return [Hash] where `keys` are `email`s and `values` an `Array<Person>`
15
- def repeated_emails
16
- init_caches
17
- @by_email.select do |email, people|
18
- people.count > 1
19
- end
20
- end
21
-
22
- # @!endgroup
23
-
24
- # @!group Analysers
25
-
26
- # TODO: Sort results by `results.first.methods`
27
- def similarity(**options)
28
- each_with_object({}) do |person, results|
29
- results[person.id] = find_all_with_score(person, **options)
30
- end
31
- end
32
-
33
-
34
- def print_analysis(threshold)
35
- similarity.each do |id, results|
36
- msg = results.results.select do |result|
37
- result.threshold?(threshold)
38
- end.map do |result|
39
- result.print
40
- end.join("\n ")
41
-
42
- puts "'#{self[id].identify}':\n " + msg
43
- end
44
- end
45
- # @!endgroup
46
-
47
- protected
48
-
49
- def on_change
50
- remove_instance_variable(@fuzzy_match)
51
- super
52
- end
53
-
54
- private
55
-
56
-
57
- end
58
- end
59
- end
60
- end