eco-helpers 2.0.18 → 2.0.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +80 -1
- data/eco-helpers.gemspec +4 -1
- data/lib/eco/api/common/base_loader.rb +9 -5
- data/lib/eco/api/common/loaders/parser.rb +1 -0
- data/lib/eco/api/common/people/default_parsers.rb +1 -0
- data/lib/eco/api/common/people/default_parsers/xls_parser.rb +53 -0
- data/lib/eco/api/common/people/entries.rb +1 -0
- data/lib/eco/api/common/people/entry_factory.rb +88 -23
- data/lib/eco/api/common/people/person_entry.rb +1 -0
- data/lib/eco/api/common/people/person_parser.rb +1 -1
- data/lib/eco/api/common/session.rb +1 -0
- data/lib/eco/api/common/session/base_session.rb +2 -0
- data/lib/eco/api/common/session/helpers.rb +30 -0
- data/lib/eco/api/common/session/helpers/prompt_user.rb +34 -0
- data/lib/eco/api/common/version_patches/ecoportal_api/external_person.rb +1 -1
- data/lib/eco/api/common/version_patches/ecoportal_api/internal_person.rb +7 -4
- data/lib/eco/api/common/version_patches/exception.rb +5 -2
- data/lib/eco/api/microcases/with_each.rb +67 -6
- data/lib/eco/api/microcases/with_each_present.rb +4 -2
- data/lib/eco/api/microcases/with_each_starter.rb +4 -2
- data/lib/eco/api/organization.rb +1 -1
- data/lib/eco/api/organization/people.rb +94 -25
- data/lib/eco/api/organization/people_similarity.rb +272 -0
- data/lib/eco/api/organization/person_schemas.rb +5 -1
- data/lib/eco/api/organization/policy_groups.rb +5 -1
- data/lib/eco/api/organization/tag_tree.rb +33 -0
- data/lib/eco/api/session.rb +19 -8
- data/lib/eco/api/session/batch.rb +7 -5
- data/lib/eco/api/session/batch/job.rb +34 -9
- data/lib/eco/api/usecases.rb +2 -2
- data/lib/eco/api/usecases/base_case.rb +2 -2
- data/lib/eco/api/usecases/base_io.rb +17 -4
- data/lib/eco/api/usecases/default_cases.rb +1 -0
- data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +179 -32
- data/lib/eco/api/usecases/default_cases/clean_unknown_tags_case.rb +37 -0
- data/lib/eco/api/usecases/default_cases/to_csv_case.rb +81 -36
- data/lib/eco/api/usecases/default_cases/to_csv_detailed_case.rb +3 -4
- data/lib/eco/api/usecases/ooze_samples/ooze_update_case.rb +3 -2
- data/lib/eco/cli/config/default/input.rb +61 -8
- data/lib/eco/cli/config/default/options.rb +47 -2
- data/lib/eco/cli/config/default/people.rb +18 -24
- data/lib/eco/cli/config/default/usecases.rb +33 -2
- data/lib/eco/cli/config/default/workflow.rb +12 -7
- data/lib/eco/cli/scripting/args_helpers.rb +2 -2
- data/lib/eco/csv.rb +4 -2
- data/lib/eco/csv/table.rb +121 -21
- data/lib/eco/data/fuzzy_match.rb +109 -27
- data/lib/eco/data/fuzzy_match/chars_position_score.rb +3 -2
- data/lib/eco/data/fuzzy_match/ngrams_score.rb +19 -10
- data/lib/eco/data/fuzzy_match/pairing.rb +12 -19
- data/lib/eco/data/fuzzy_match/result.rb +22 -2
- data/lib/eco/data/fuzzy_match/results.rb +30 -6
- data/lib/eco/data/fuzzy_match/score.rb +12 -7
- data/lib/eco/data/fuzzy_match/string_helpers.rb +14 -1
- data/lib/eco/version.rb +1 -1
- metadata +67 -3
- data/lib/eco/api/organization/people_analytics.rb +0 -60
@@ -0,0 +1,30 @@
|
|
1
|
+
require_relative 'helpers/prompt_user'
|
2
|
+
|
3
|
+
module Eco
  module API
    module Common
      module Session
        # Entry point mixin for the session helpers: including it wires both
        # the instance-level and the class-level helper modules into the host.
        module Helpers
          # Class-level helpers added to the including class via `extend`.
          # Currently empty.
          module ClassMethods
          end

          # Instance-level helpers added to the including class via `include`.
          module InstanceMethods
            include Helpers::PromptUser
          end

          # Hook run when `Helpers` is included.
          # @param base [Module] the class or module including `Helpers`.
          def self.included(base)
            base.send(:include, InstanceMethods)
            base.extend(ClassMethods)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'timeout'
|
2
|
+
module Eco
  module API
    module Common
      module Session
        module Helpers
          # Console prompt helper.
          # The including object must respond to `config`, and that object to `run_mode_remote?`.
          module PromptUser

            # Asks `question` on the console and returns (or yields) the typed answer.
            # @note in remote run mode nothing is printed or read: `default` is used straight away.
            # @param question [String] the text printed right before reading the answer.
            # @param default [Object] the answer used in remote mode, when `timeout` expires
            #   or when the standard input is closed (end of file).
            # @param explanation [String] text printed on its own line(s) before the `question`.
            # @param timeout [Numeric, nil] seconds to wait for an answer; `nil` waits indefinitely.
            # @yield [response] optional block to process the answer.
            # @yieldparam response [Object] the typed line without its line terminator, or `default`.
            # @return [Object] the response, or the result of the block when one is given.
            def prompt_user(question, default:, explanation: "", timeout: nil)
              response =
                if config.run_mode_remote?
                  default
                else
                  puts explanation
                  print "#{question} "
                  line =
                    if timeout
                      begin
                        Timeout.timeout(timeout) { STDIN.gets }
                      rescue Timeout::Error
                        nil
                      end
                    else
                      STDIN.gets
                    end
                  # `gets` returns nil at EOF; `chomp` (unlike `chop`) only strips a line terminator,
                  # so an answer that does not end in a newline keeps its last character.
                  line.nil? ? default : line.chomp
                end
              return response unless block_given?
              yield(response)
            end

          end
        end
      end
    end
  end
end
|
@@ -11,12 +11,15 @@ module Ecoportal
|
|
11
11
|
original_doc["account"] = JSON.parse(doc["account"])
|
12
12
|
end
|
13
13
|
|
14
|
-
def new?
|
15
|
-
|
14
|
+
# @param doc [Symbol] `:original` to compare against `original_doc`;
#   any other value compares against `initial_doc`.
# @return [Boolean] `true` when the reference doc has neither `details` nor `account`.
def new?(doc = :initial)
  reference = doc == :original ? original_doc : initial_doc
  !(reference["details"] || reference["account"])
end
|
17
18
|
|
18
|
-
|
19
|
-
|
19
|
+
# @param doc [Symbol] `:original` to compare against `original_doc`;
#   any other value compares against `initial_doc`.
# @return [Boolean] truthy when there is an `account` now but the reference doc had none.
def account_added?(doc = :initial)
  baseline =
    if doc == :original
      original_doc
    else
      initial_doc
    end
  account && !baseline["account"]
end
|
21
24
|
|
22
25
|
end
|
@@ -2,8 +2,11 @@ class ::Exception
|
|
2
2
|
def patch_full_message
|
3
3
|
begin
|
4
4
|
msg = []
|
5
|
-
|
6
|
-
|
5
|
+
tracing = backtrace ? backtrace : []
|
6
|
+
tracing = (self.class == SystemStackError) ? tracing[1..30] : tracing[1..-1]
|
7
|
+
tracing ||= []
|
8
|
+
msg << "\n#{tracing.first} \n#{message} (#{self.class.to_s})"
|
9
|
+
tracing.each_with_index {|bt, i| msg << "#{" "*8}#{i+1}: from #{bt}"}
|
7
10
|
msg.join("\n")
|
8
11
|
rescue Exception => e
|
9
12
|
puts "Something is wrong with 'patch_full_message': #{e}"
|
@@ -12,16 +12,77 @@ module Eco
|
|
12
12
|
# @yieldparam person [Ecoportal::API::V1::Person] the found person that matches `entry`, or a new person otherwise.
|
13
13
|
# @return [Eco::API::Organization::People] all the people, including new and existing ones.
|
14
14
|
def with_each(entries, people, options)
  # The "ignore all" choice of the interactive prompt only lasts for one call.
  @_skip_all_multiple_results = false
  matched = []

  entries.each do |entry|
    target =
      begin
        people.find(entry, strict: micro.strict_search?(options)) || session.new_person
      rescue Eco::API::Organization::People::MultipleSearchResults => err
        # The user already asked to ignore every ambiguous entry: skip this one silently.
        next if @_skip_all_multiple_results
        detail = "\n * When searching this Entry: #{entry.to_s(:identify)}"
        _with_each_prompt_to_select_user(err.append_message(detail), entry: entry)
      end
    # The prompt returns nil when the user chose to ignore the entry.
    next unless target

    target.entry = entry
    yield(entry, target) if block_given?
    matched << target
  end

  people.newFrom(matched.uniq)
end
|
24
35
|
|
36
|
+
private
|
37
|
+
|
38
|
+
# Asks the user what to do when several people match the same search criterion.
# Keeps prompting (recursively) until a valid option is typed.
# @param error [Eco::API::Organization::People::MultipleSearchResults] the error carrying the candidates.
# @param entry [Object, nil] the input entry being searched; only used for log messages.
# @param increase_count [Boolean] whether this call counts as a new prompt (`false` on re-prompts).
# @raise [RuntimeError] if `error` is not a `MultipleSearchResults`.
# @raise [Eco::API::Organization::People::MultipleSearchResults] `error` itself, when the user picks (B).
# @return [Object, nil] the selected candidate, a new person (C), or `nil` when ignoring (I / A).
def _with_each_prompt_to_select_user(error, entry: nil, increase_count: true)
  unless error.is_a?(Eco::API::Organization::People::MultipleSearchResults)
    raise "Expecting Eco::API::Organization::People::MultipleSearchResults. Given: #{error.class}"
  end
  @_with_each_prompts = 0 unless instance_variable_defined?(:@_with_each_prompts)
  @_with_each_prompts += 1 if increase_count

  lines = []
  lines << "\n(#{@_with_each_prompts}) " + error.to_s + "\n"
  lines << " #index - Select the correct person by its number index among the list above."
  lines << " (I) - Just Skip/Ignore this one. I will deal with that input entry in another launch."
  lines << " (A) - Ignore all the rest of input entries with this problem."
  lines << " (C) - Create a new person."
  lines << " (B) - Just break this script. I need to change the input file :/"

  prompt_user("Type one option (#number/I/A/C/B):", explanation: lines.join("\n"), default: "I") do |res|
    res = res.to_s.strip.upcase
    # Only a plain non-negative integer is an index: `String#to_i` alone would turn
    # any garbage into 0 (silently selecting the first candidate) and accept negatives.
    pos = res.match?(/\A\d+\z/) ? res.to_i : nil
    case
    when res.start_with?("I")
      logger.info "Ignoring entry... #{entry.to_s(:identify) if entry}"
      nil
    when res.start_with?("A")
      logger.info "All input entries with this same issue will be ignored for this launch"
      @_skip_all_multiple_results = true
      nil
    when res.start_with?("C")
      logger.info "Creating new person...#{"for entry #{entry.to_s(:identify)}" if entry}"
      session.new_person
    when res.start_with?("B")
      raise error
    when pos && pos < error.candidates.length
      error.candidate(pos).tap do |person|
        logger.info "Thanks!! You selected #{person.identify}"
        sleep(1.5)
      end
    else
      if pos
        print "#{pos} is not a number in the range. "
      else
        print "#{res} is not an option. "
      end
      puts "Please select one of the offered options..."
      sleep(1)
      _with_each_prompt_to_select_user(error, increase_count: false, entry: entry)
    end
  end
end
|
85
|
+
|
25
86
|
end
|
26
87
|
end
|
27
88
|
end
|
@@ -15,8 +15,10 @@ module Eco
|
|
15
15
|
def with_each_present(entries, people, options, log_starter: false)
|
16
16
|
found = []
|
17
17
|
micro.with_each(entries, people, options) do |entry, person|
|
18
|
-
if person.new?
|
19
|
-
|
18
|
+
if person.new?
|
19
|
+
if log_starter
|
20
|
+
session.logger.error("This person does not exist: #{entry.to_s(:identify)}")
|
21
|
+
end
|
20
22
|
next
|
21
23
|
end
|
22
24
|
found << person
|
@@ -15,8 +15,10 @@ module Eco
|
|
15
15
|
def with_each_starter(entries, people, options, log_present: false)
|
16
16
|
starters = []
|
17
17
|
micro.with_each(entries, people, options) do |entry, person|
|
18
|
-
if !person.new?
|
19
|
-
|
18
|
+
if !person.new?
|
19
|
+
if log_present
|
20
|
+
session.logger.error("This person (id: '#{person.id}') already exists: #{entry.to_s(:identify)}")
|
21
|
+
end
|
20
22
|
next
|
21
23
|
end
|
22
24
|
starters << person
|
data/lib/eco/api/organization.rb
CHANGED
@@ -9,7 +9,7 @@ require_relative 'organization/tag_tree'
|
|
9
9
|
require_relative 'organization/presets_factory'
|
10
10
|
require_relative 'organization/preferences'
|
11
11
|
require_relative 'organization/people'
|
12
|
-
require_relative 'organization/
|
12
|
+
require_relative 'organization/people_similarity'
|
13
13
|
require_relative 'organization/person_schemas'
|
14
14
|
require_relative 'organization/policy_groups'
|
15
15
|
require_relative 'organization/login_providers'
|
@@ -2,6 +2,43 @@ module Eco
|
|
2
2
|
module API
|
3
3
|
module Organization
|
4
4
|
class People < Eco::Language::Models::Collection
|
5
|
+
# Error class that allows to handle cases where multiple people were found for the same criterion.
|
6
|
+
# @note its main purpose is to prevent the creation of duplicates or the overriding of information between different people.
|
7
|
+
class MultipleSearchResults < StandardError
  attr_reader :candidates, :property

  # @param msg [String] the basic error message; a summary of the candidates is appended to it.
  # @param candidates [Array<Person>] the people that match the same search criterion.
  # @param property [String] the property of the person model that triggered the error (base of the search criterion).
  def initialize(msg, candidates: [], property: "email")
    @candidates = candidates
    @property   = property
    super(msg + " " + candidates_summary)
  end

  # @param with_index [Boolean] to prefix each description with the candidate position (0-based).
  # @return [Array<String>] one description per candidate: account status plus `identify`.
  def identify_candidates(with_index: false)
    candidates.each_with_index.map do |person, position|
      status =
        if !person.account
          "(no account)"
        elsif person.account_added?
          "(new user)"
        else
          "(user)"
        end
      prefix = with_index ? "#{position}. " : ""
      "#{prefix}#{status} #{person.identify}"
    end
  end

  # @param index [Integer] position in `candidates`.
  # @return [Person] the candidate in the `index` position.
  def candidate(index)
    candidates[index]
  end

  private

  # @return [String] a header line followed by the indexed candidates, one per line.
  def candidates_summary
    header = "The following people have the same '#{property}':"
    [header, *identify_candidates(with_index: true)].join("\n ")
  end
end
|
41
|
+
|
5
42
|
# build the shortcuts of Collection
|
6
43
|
attr_presence :account, :details
|
7
44
|
attr_collection :id, :external_id, :email, :name, :supervisor_id
|
@@ -78,34 +115,36 @@ module Eco
|
|
78
115
|
# @!group Searchers
|
79
116
|
|
80
117
|
# It searches a person using the parameters given.
|
118
|
+
# @note This is how the search function actually works:
|
119
|
+
# 1. if eP `id` is given, returns the person (if found), otherwise...
|
120
|
+
# 2. if `external_id` is given, returns the person (if found), otherwise...
|
121
|
+
# 3. if `strict` is `false` and `email` is given:
|
122
|
+
# - if there is only 1 person with that email, returns that person, otherwise...
|
123
|
+
# - if found but, there are many candidates, it raises MultipleSearchResults error
|
124
|
+
# - if person `external_id` matches `email`, returns that person
|
125
|
+
# @raise MultipleSearchResults if there are multiple people with the same `email`
|
126
|
+
# and there's no other criteria to find the person. It only gets to this point if
|
127
|
+
# `external_id` was **not** provided and we are **not** in 'strict' search mode.
|
128
|
+
# However, it could be we were in `strict` mode and `external_id` was not provided.
|
81
129
|
# @param id [String] the `internal id` of the person
|
82
130
|
# @param external_id [String] the `external_id` of the person
|
83
131
|
# @param email [String] the `email` of the person
|
84
|
-
# @param strict [Boolean] if should perform a
|
132
|
+
# @param strict [Boolean] if should perform a `:soft` or a `:strict` search. `strict` will avoid repeated email addresses.
|
85
133
|
# @return [Person, nil] the person we were searching, or `nil` if not found.
|
86
134
|
def person(id: nil, external_id: nil, email: nil, strict: false)
  init_caches
  # Normalize: blank values become `false`, so they never hit a cache key.
  ext_id = external_id.to_s.strip.empty? ? false : external_id.strip
  email  = email.to_s.strip.empty? ? false : email.downcase.strip

  found = @by_id[id]&.first
  found ||= @by_external_id[ext_id]&.first
  return found if found
  # A strict search that was given an `external_id` never falls back to the email.
  return found if strict && ext_id

  person_by_email(email)
end
|
108
146
|
|
147
|
+
# @see Eco::API::Organization::People#person
|
109
148
|
def find(object, strict: false)
|
110
149
|
id = attr_value(object, "id")
|
111
150
|
external_id = attr_value(object, "external_id")
|
@@ -177,8 +216,8 @@ module Eco
|
|
177
216
|
# @!endgroup
|
178
217
|
|
179
218
|
# @!group Helper methods
|
180
|
-
def
|
181
|
-
Eco::API::Organization::
|
219
|
+
# @return [Eco::API::Organization::PeopleSimilarity] a similarity helper built from the people of this collection.
def similarity
  everyone = self.to_a
  Eco::API::Organization::PeopleSimilarity.new(everyone)
end
|
183
222
|
# @!endgroup
|
184
223
|
|
@@ -190,16 +229,46 @@ module Eco
|
|
190
229
|
|
191
230
|
private
|
192
231
|
|
232
|
+
# Looks up a person by (already normalized) `email`.
# @param email [String, false, nil] the email to look up; falsy values return `nil`.
# @param prevent_duplicates [Boolean] whether to raise when several people share the `email`.
# @raise [MultipleSearchResults] if more than one person has that `email` and `prevent_duplicates` is `true`.
# @return [Person, nil] the single person with that email; otherwise the first person
#   whose `external_id` equals `email`, or `nil`.
def person_by_email(email, prevent_duplicates: true)
  return nil unless email

  without_account = @by_non_users_email[email] || []
  user            = (@by_users_email[email] || []).first

  # Unambiguous cases: a lone user, or a lone non-user.
  return user if user && without_account.empty?
  return without_account.first if !user && without_account.length == 1

  matches = user ? [user] + without_account : without_account
  if prevent_duplicates && !matches.empty?
    msg = "Multiple search results match the criteria."
    raise MultipleSearchResults.new(msg, candidates: matches, property: "email")
  end

  @by_external_id[email]&.first
end
|
252
|
+
|
193
253
|
# Builds the lookup hashes used by the searchers; does nothing once they are built.
# Each hash (except `@by_id`) drops the entries stored under a `nil` key.
def init_caches
  unless @caches_init
    @by_id              = to_h
    @by_external_id     = no_nil_key(to_h('external_id'))
    @by_users_email     = no_nil_key(existing_users.to_h('email'))
    @by_non_users_email = no_nil_key(non_users.to_h('email'))
    @by_email           = no_nil_key(to_h('email'))
    @caches_init        = true
  end
end
|
202
262
|
|
263
|
+
# @return [Object] a collection (via `newFrom`) with the `users` whose account
#   was not added compared to their `:original` doc.
def existing_users
  already_users = users.select { |user| !user.account_added?(:original) }
  newFrom(already_users)
end
|
266
|
+
|
267
|
+
# Removes the `nil` key from `hash` (in place).
# @param hash [Hash] the hash to clean up.
# @return [Hash] the same `hash` object.
def no_nil_key(hash)
  hash.delete(nil)
  hash
end
|
270
|
+
|
271
|
+
|
203
272
|
end
|
204
273
|
end
|
205
274
|
end
|
@@ -0,0 +1,272 @@
|
|
1
|
+
module Eco
  module API
    module Organization

      # Class to find out duplicates in the People Manager
      #
      # People are compared through the value read via `attribute` (`:name` by default).
      class PeopleSimilarity < Eco::API::Organization::People
        include Eco::Data::FuzzyMatch

        # @!group Config

        # @param attr [String, Symbol, Proc, nil] the target attribute to be read.
        def attribute=(attr)
          @attribute = attr
        end

        # @return [String, Symbol, Proc] the target attribute to be read (`:name` unless set).
        def attribute
          @attribute ||= :name
        end

        # Returns the target value to analyse
        # @param person [Ecoportal::API::V1::Person]
        # @return [Object, nil] the result of calling `attribute` with `person` when it is a `Proc`;
        #   otherwise the result of sending `attribute` to `person`, or `nil` if `person`
        #   does not respond to it.
        def item_value(person)
          return attribute.call(person) if attribute.is_a?(Proc)
          attr = attribute.to_sym
          return person.send(attr) if person.respond_to?(attr)
        end

        # Define the order of relevance of per user matches
        # @param values [Array<Symbol>] the algorithms' results it should be ordered by
        #   * Possible values: `:dice`, `:levenshtein`, `:jaro_winkler`, `:ngrams`, `:words_ngrams`, `:chars_position`
        def order=(values)
          @order = values
        end

        # @return [Array<Symbol>] the configured order (`[:words_ngrams, :dice]` unless set).
        def order
          @order ||= [:words_ngrams, :dice]
        end

        # Define the threshold of per user matches
        # @param value [Float] the threshold that all of the algorithms should comply with
        def threshold=(value)
          @threshold = value
        end

        # @return [Float] the configured threshold (`0.15` unless set).
        def threshold
          @threshold ||= 0.15
        end

        # Generates a new object with same config but different base `data`.
        # @return [Eco::API::Organization::PeopleSimilarity]
        def newFrom(data)
          super(data).tap do |simil|
            simil.threshold = threshold
            simil.order     = order
            simil.attribute = attribute
          end
        end

        # @!endgroup

        # @!group Searchers

        # It gathers those that have the same `email`
        # @return [Hash] where `keys` are `email`s and `values` an `Array<Person>`
        def repeated_emails
          init_caches
          @by_email.select do |email, people|
            people.count > 1
          end
        end

        # It returns all people with no name (less than 2 characters once stripped)
        # @return [Eco::API::Organization::PeopleSimilarity]
        def unnamed
          select do |person|
            person.name.to_s.strip.length < 2
          end.yield_self do |results|
            newFrom(results)
          end
        end

        # It returns all people with a name (at least 2 characters once stripped)
        # @return [Eco::API::Organization::PeopleSimilarity]
        def named
          reject do |person|
            person.name.to_s.strip.length < 2
          end.yield_self do |results|
            newFrom(results)
          end
        end

        # It returns all the entries with `attribute` empty (less than 2 characters once stripped)
        # @return [Eco::API::Organization::PeopleSimilarity]
        def blank_attribute
          select do |person|
            item_value(person).to_s.strip.length < 2
          end.yield_self do |results|
            newFrom(results)
          end
        end

        # It returns all the entries with `attribute` **not** empty
        # @return [Eco::API::Organization::PeopleSimilarity]
        def attribute_present
          reject do |person|
            item_value(person).to_s.strip.length < 2
          end.yield_self do |results|
            newFrom(results)
          end
        end

        # @!endgroup

        # @!group Analysis starters

        # Analyses People based on `options`
        # @param needle_read [Proc, Symbol] when the value to read from `needle` object is different to the `:read` (`attribute`).
        #   This allows to for example, facet `needle.name` (needle_read) against `haystack_item.details[alt_id]` (read).
        # @param keep_empty [Boolean] when `false`, people with no results are removed from the output
        # @param options [Hash] passed to `find_all_with_score`; `:read` defaults to `attribute`.
        # @return [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
        def analyse(needle_read: nil, keep_empty: false, **options)
          options = { read: self.attribute }.merge(options)
          total = count; i = 1
          each_with_object({}) do |person, results|
            needle_str = needle_read ? item_string(person, needle_read) : nil
            results[person.id] = find_all_with_score(person, needle_str: needle_str, **options)
            print_progress("Analysed", total, i)
            i += 1
          end.yield_self do |analysed|
            analysed = clean_empty(analysed) unless keep_empty
            analysed
          end
        end

        # @!endgroup

        # @!group Results Treatment

        # Gets a new instance object of this class, with only people in results
        # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
        # @return [Eco::API::Organization::PeopleSimilarity]
        def newSimilarity(analysed)
          newFrom(people_in_results(analysed))
        end

        # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
        # @return [Array<Person>] each person of `analysed` followed by its matches, without repetitions.
        def people_in_results(analysed)
          analysed.each_with_object([]) do |(id, results), people|
            related = results.each_with_object([self[id]]) do |result, group|
              group << result.match
            end
            related.each {|person| people << person unless people.include?(person)}
          end
        end

        # Removes from results those that do not have similar entries
        # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
        # @return [Hash] the pairs of `analysed` whose results are not empty.
        def clean_empty(analysed)
          analysed.select do |id, results|
            !results.empty?
          end
        end

        # Helper to do some treatment of the results
        # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
        # @param keep_empty [Boolean] when `false`, people with no results are removed from the output
        # @yield [person, results] the treatment to apply to each pair.
        # @yieldreturn the new results for that person.
        # @return [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
        def with_analysed(analysed, keep_empty: false)
          analysed.each_with_object({}) do |(id, results), reanalysed|
            reanalysed[id] = yield(self[id], results)
          end.yield_self do |reanalysed|
            reanalysed = clean_empty(reanalysed) unless keep_empty
            reanalysed
          end
        end

        # Launches a reanalysis on `analysed` based on `options`
        # @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
        def rearrange(analysed, **options)
          with_analysed(analysed) do |person, results|
            results.relevant_results(**options)
          end
        end

        # Reanalyses by using a block to treat the needle and item values
        # @note `options` is accepted for interface compatibility; this method does not forward it.
        # @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
        # @param msg [String] the text shown by the progress indicator.
        def reanalyse(analysed, msg: "Reanalysing", **options, &block)
          total = analysed.count; i = 1
          with_analysed(analysed) do |person, results|
            print_progress(msg, total, i)
            i += 1
            recalculate_results(results, &block)
          end
        end

        # Reanalyses by ignoring matching words between the `needle` and those found in `results`
        def ignore_matching_words(analysed, **options)
          prompt = "Reanalysing by ignoring matching words"
          reanalyse(analysed, msg: prompt, **options) do |needle_str, item_str, needle, item|
            self.class.remove_matching_words(needle_str, item_str)
          end
        end

        # Reanalyses by ignoring matching words between the `needle` and those found in `results`
        def ignore_matching_words_old(analysed, **options)
          options = { read: self.attribute }.merge(options)
          total = analysed.count; i = 1
          with_analysed(analysed) do |person, results|
            print_progress("Reanalysing by ignoring matching words", total, i)
            i += 1
            ignore_same_words_score(results, **options)
          end
        end

        # @!endgroup

        # @!group Reporting Helpers

        # @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
        # @param format [Symbol] only `:txt` is supported.
        # @return [String, nil] well structured text; `nil` for any `format` other than `:txt`.
        def report(analysed, format: :txt)
          case
          when format == :txt
            analysed.each_with_object("") do |(id, results), out|
              msg = results.results.map {|r| r.print}.join("\n ")
              out << "#{self[id].identify}:\n " + msg + "\n"
            end
          end
        end

        # Prints the report of an analysis once, and returns the analysis.
        # @note unless `:analysed` is provided, it launches an analysis through `results_with_false_positives`.
        #   NOTE(review): `results_with_false_positives` is not defined in this file — confirm where it comes from.
        # @param options [Hash] `:analysed` is the analysis to print; everything is forwarded to `analyse` otherwise.
        # @return [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
        def print_analysis(**options)
          analysed = options[:analysed] || results_with_false_positives.analyse(**options)
          puts report(analysed)
          analysed
        end
        # @!endgroup

        protected

        # Drops the `@fuzzy_match` instance variable (if set) before delegating to `super`.
        def on_change
          # `remove_instance_variable` expects the variable *name* and raises `NameError` when it is not defined.
          remove_instance_variable(:@fuzzy_match) if instance_variable_defined?(:@fuzzy_match)
          super
        end

        private

        # Prints a single-line progress indicator; it does nothing when `total` is 10 or less.
        # @param msg [String] the text that precedes the percentage.
        # @param total [Integer] the number of items to process.
        # @param num [Integer] the 1-based position of the current item.
        def print_progress(msg, total, num)
          return unless total > 10
          puts "" unless num > 1
          @print_msg_len ||= 0
          percent = (100 * num.to_f / total).round(1)
          msg = " #{msg}: #{percent}% (#{num} of #{total})\r"
          # Remember the longest line printed, to be able to blank it at the end.
          @print_msg_len = msg.length unless @print_msg_len > msg.length
          print msg
          $stdout.flush
          if percent > 99.9
            sleep(0.2)
            print "#{" " * @print_msg_len}\r"
            $stdout.flush
          end
        end

      end
    end
  end
end
|