eco-helpers 2.0.15 → 2.0.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +109 -3
- data/eco-helpers.gemspec +11 -5
- data/lib/eco-helpers.rb +2 -0
- data/lib/eco/api/common/base_loader.rb +14 -0
- data/lib/eco/api/common/loaders/parser.rb +1 -0
- data/lib/eco/api/common/people/default_parsers/date_parser.rb +11 -1
- data/lib/eco/api/common/people/default_parsers/login_providers_parser.rb +1 -1
- data/lib/eco/api/common/people/default_parsers/policy_groups_parser.rb +11 -11
- data/lib/eco/api/common/people/entries.rb +1 -0
- data/lib/eco/api/common/people/entry_factory.rb +74 -23
- data/lib/eco/api/common/people/person_entry.rb +5 -2
- data/lib/eco/api/common/people/supervisor_helpers.rb +27 -0
- data/lib/eco/api/common/session.rb +1 -0
- data/lib/eco/api/common/session/base_session.rb +2 -0
- data/lib/eco/api/common/session/file_manager.rb +2 -2
- data/lib/eco/api/common/session/helpers.rb +30 -0
- data/lib/eco/api/common/session/helpers/prompt_user.rb +34 -0
- data/lib/eco/api/common/session/mailer.rb +0 -1
- data/lib/eco/api/common/session/s3_uploader.rb +0 -1
- data/lib/eco/api/common/session/sftp.rb +0 -1
- data/lib/eco/api/common/version_patches/ecoportal_api/external_person.rb +1 -1
- data/lib/eco/api/common/version_patches/ecoportal_api/internal_person.rb +7 -4
- data/lib/eco/api/common/version_patches/exception.rb +11 -4
- data/lib/eco/api/microcases.rb +3 -1
- data/lib/eco/api/microcases/append_usergroups.rb +0 -1
- data/lib/eco/api/microcases/people_cache.rb +2 -2
- data/lib/eco/api/microcases/people_load.rb +2 -2
- data/lib/eco/api/microcases/people_refresh.rb +2 -2
- data/lib/eco/api/microcases/people_search.rb +6 -6
- data/lib/eco/api/microcases/preserve_default_tag.rb +23 -0
- data/lib/eco/api/microcases/preserve_filter_tags.rb +28 -0
- data/lib/eco/api/microcases/preserve_policy_groups.rb +30 -0
- data/lib/eco/api/microcases/set_account.rb +0 -1
- data/lib/eco/api/microcases/with_each.rb +67 -6
- data/lib/eco/api/microcases/with_each_present.rb +4 -2
- data/lib/eco/api/microcases/with_each_starter.rb +4 -2
- data/lib/eco/api/organization.rb +1 -0
- data/lib/eco/api/organization/people.rb +98 -22
- data/lib/eco/api/organization/people_similarity.rb +272 -0
- data/lib/eco/api/organization/person_schemas.rb +5 -1
- data/lib/eco/api/organization/policy_groups.rb +5 -1
- data/lib/eco/api/organization/presets_factory.rb +40 -80
- data/lib/eco/api/organization/presets_integrity.json +6 -0
- data/lib/eco/api/organization/presets_values.json +5 -4
- data/lib/eco/api/organization/tag_tree.rb +33 -0
- data/lib/eco/api/policies/default_policies/99_user_access_policy.rb +0 -30
- data/lib/eco/api/session.rb +10 -24
- data/lib/eco/api/session/batch.rb +25 -7
- data/lib/eco/api/session/config.rb +16 -15
- data/lib/eco/api/session/config/api.rb +4 -0
- data/lib/eco/api/session/config/apis.rb +80 -0
- data/lib/eco/api/session/config/files.rb +7 -0
- data/lib/eco/api/session/config/people.rb +3 -19
- data/lib/eco/api/usecases/default_cases.rb +4 -1
- data/lib/eco/api/usecases/default_cases/abstract_policygroup_abilities_case.rb +161 -0
- data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +223 -0
- data/lib/eco/api/usecases/default_cases/clean_unknown_tags_case.rb +37 -0
- data/lib/eco/api/usecases/default_cases/codes_to_tags_case.rb +2 -3
- data/lib/eco/api/usecases/default_cases/reset_landing_page_case.rb +11 -1
- data/lib/eco/api/usecases/default_cases/restore_db_case.rb +1 -2
- data/lib/eco/api/usecases/default_cases/supers_cyclic_identify_case.rb +72 -0
- data/lib/eco/api/usecases/default_cases/supers_hierarchy_case.rb +1 -1
- data/lib/eco/api/usecases/default_cases/to_csv_case.rb +132 -29
- data/lib/eco/api/usecases/default_cases/to_csv_detailed_case.rb +61 -36
- data/lib/eco/api/usecases/ooze_samples/ooze_update_case.rb +3 -2
- data/lib/eco/cli.rb +0 -10
- data/lib/eco/cli/config/default/options.rb +48 -17
- data/lib/eco/cli/config/default/people.rb +18 -24
- data/lib/eco/cli/config/default/people_filters.rb +3 -3
- data/lib/eco/cli/config/default/usecases.rb +105 -28
- data/lib/eco/cli/config/default/workflow.rb +21 -12
- data/lib/eco/cli/config/help.rb +1 -0
- data/lib/eco/cli/config/options_set.rb +106 -13
- data/lib/eco/cli/config/use_cases.rb +33 -33
- data/lib/eco/cli/scripting/args_helpers.rb +30 -3
- data/lib/eco/csv.rb +4 -2
- data/lib/eco/csv/table.rb +121 -21
- data/lib/eco/data.rb +1 -0
- data/lib/eco/data/crypto/encryption.rb +3 -3
- data/lib/eco/data/files/directory.rb +28 -20
- data/lib/eco/data/files/helpers.rb +6 -4
- data/lib/eco/data/fuzzy_match.rb +201 -0
- data/lib/eco/data/fuzzy_match/array_helpers.rb +75 -0
- data/lib/eco/data/fuzzy_match/chars_position_score.rb +38 -0
- data/lib/eco/data/fuzzy_match/ngrams_score.rb +82 -0
- data/lib/eco/data/fuzzy_match/pairing.rb +95 -0
- data/lib/eco/data/fuzzy_match/result.rb +87 -0
- data/lib/eco/data/fuzzy_match/results.rb +77 -0
- data/lib/eco/data/fuzzy_match/score.rb +49 -0
- data/lib/eco/data/fuzzy_match/stop_words.rb +35 -0
- data/lib/eco/data/fuzzy_match/string_helpers.rb +82 -0
- data/lib/eco/version.rb +1 -1
- metadata +168 -11
- data/lib/eco/api/microcases/refresh_abilities.rb +0 -19
- data/lib/eco/api/organization/presets_reference.json +0 -59
- data/lib/eco/api/usecases/default_cases/refresh_abilities_case.rb +0 -30
|
@@ -15,8 +15,10 @@ module Eco
|
|
|
15
15
|
def with_each_present(entries, people, options, log_starter: false)
|
|
16
16
|
found = []
|
|
17
17
|
micro.with_each(entries, people, options) do |entry, person|
|
|
18
|
-
if person.new?
|
|
19
|
-
|
|
18
|
+
if person.new?
|
|
19
|
+
if log_starter
|
|
20
|
+
session.logger.error("This person does not exist: #{entry.to_s(:identify)}")
|
|
21
|
+
end
|
|
20
22
|
next
|
|
21
23
|
end
|
|
22
24
|
found << person
|
|
@@ -15,8 +15,10 @@ module Eco
|
|
|
15
15
|
def with_each_starter(entries, people, options, log_present: false)
|
|
16
16
|
starters = []
|
|
17
17
|
micro.with_each(entries, people, options) do |entry, person|
|
|
18
|
-
if !person.new?
|
|
19
|
-
|
|
18
|
+
if !person.new?
|
|
19
|
+
if log_present
|
|
20
|
+
session.logger.error("This person (id: '#{person.id}') already exists: #{entry.to_s(:identify)}")
|
|
21
|
+
end
|
|
20
22
|
next
|
|
21
23
|
end
|
|
22
24
|
starters << person
|
data/lib/eco/api/organization.rb
CHANGED
|
@@ -9,6 +9,7 @@ require_relative 'organization/tag_tree'
|
|
|
9
9
|
require_relative 'organization/presets_factory'
|
|
10
10
|
require_relative 'organization/preferences'
|
|
11
11
|
require_relative 'organization/people'
|
|
12
|
+
require_relative 'organization/people_similarity'
|
|
12
13
|
require_relative 'organization/person_schemas'
|
|
13
14
|
require_relative 'organization/policy_groups'
|
|
14
15
|
require_relative 'organization/login_providers'
|
|
@@ -2,6 +2,43 @@ module Eco
|
|
|
2
2
|
module API
|
|
3
3
|
module Organization
|
|
4
4
|
class People < Eco::Language::Models::Collection
|
|
5
|
+
# Error class that allows to handle cases where multiple people were found for the same criterion.
# @note its main purpose is to prevent the creation of duplicates or overriding information between different people.
class MultipleSearchResults < StandardError
  attr_reader :candidates, :property

  # @param msg [String] the basic error message.
  # @param candidates [Array<Person>] the people that match the same search criterion.
  # @param property [String] the property of the person model that triggered the error (base of the search criterion).
  def initialize(msg, candidates: [], property: "email")
    @candidates = candidates
    @property   = property
    # Interpolation (rather than `+`) keeps the error usable even if `msg` is `nil`.
    super("#{msg} #{candidates_summary}")
  end

  # @param with_index [Boolean] to add an index (position in `candidates`, starting at 0) to each candidate description.
  # @return [Array<String>] the `candidates` identified
  def identify_candidates(with_index: false)
    candidates.each_with_index.map do |person, i|
      prefix = with_index ? "#{i}. " : ""
      "#{prefix}#{account_label(person)} #{person.identify}"
    end
  end

  # @param index [Integer] the position of the candidate in `candidates`.
  # @return [Person] the `candidate` in the `index` position
  def candidate(index)
    candidates[index]
  end

  private

  # @return [String] a label that tells if the person has an account, and if that account has just been added.
  def account_label(person)
    return "(no account)" unless person.account
    person.account_added? ? "(new user)" : "(user)"
  end

  # @return [String] the header followed by one line per candidate (with its index).
  def candidates_summary
    header = "The following people have the same '#{property}':"
    [header, *identify_candidates(with_index: true)].join("\n ")
  end
end
|
|
41
|
+
|
|
5
42
|
# build the shortcuts of Collection
|
|
6
43
|
attr_presence :account, :details
|
|
7
44
|
attr_collection :id, :external_id, :email, :name, :supervisor_id
|
|
@@ -78,34 +115,36 @@ module Eco
|
|
|
78
115
|
# @!group Searchers
|
|
79
116
|
|
|
80
117
|
# It searches a person using the parameters given.
# @note This is how the search function actually works:
#   1. if eP `id` is given, returns the person (if found), otherwise...
#   2. if `external_id` is given, returns the person (if found), otherwise...
#   3. if `email` is given (and it is **not** the case that `strict` is `true` and `external_id` was provided):
#     - if there is only 1 person with that email, returns that person, otherwise...
#     - if found but, there are many candidates, it raises MultipleSearchResults error
#     - if person `external_id` matches `email`, returns that person
# @raise [MultipleSearchResults] if there are multiple people with the same `email`
#   and there's no other criteria to find the person. It only gets to this point if
#   the person was not found by `id` nor `external_id`, and either `strict` is `false`
#   or `external_id` was **not** provided.
# @param id [String] the `internal id` of the person
# @param external_id [String] the `external_id` of the person
# @param email [String] the `email` of the person
# @param strict [Boolean] if should perform a `:soft` or a `:strict` search. `strict` will avoid repeated email addresses.
# @return [Person, nil] the person we were searching, or `nil` if not found.
def person(id: nil, external_id: nil, email: nil, strict: false)
  init_caches

  # normalize values: blank ones become `nil`; `to_s` avoids failing on non-String values
  ext_id = external_id.to_s.strip
  ext_id = nil if ext_id.empty?
  email  = email.to_s.downcase.strip
  email  = nil if email.empty?

  found = @by_id[id]&.first
  found ||= @by_external_id[ext_id]&.first if ext_id
  # the email is only skipped when in `strict` mode **and** an `external_id` was given
  found ||= person_by_email(email) unless strict && ext_id
  found
end
|
|
108
146
|
|
|
147
|
+
# @see Eco::API::Organization::People#person
|
|
109
148
|
def find(object, strict: false)
|
|
110
149
|
id = attr_value(object, "id")
|
|
111
150
|
external_id = attr_value(object, "external_id")
|
|
@@ -176,6 +215,12 @@ module Eco
|
|
|
176
215
|
end
|
|
177
216
|
# @!endgroup
|
|
178
217
|
|
|
218
|
+
# @!group Helper methods
|
|
219
|
+
# @return [Eco::API::Organization::PeopleSimilarity] a similarity object built from the people of this collection.
def similarity
  people_list = to_a
  Eco::API::Organization::PeopleSimilarity.new(people_list)
end
|
|
222
|
+
# @!endgroup
|
|
223
|
+
|
|
179
224
|
protected
|
|
180
225
|
|
|
181
226
|
def on_change
|
|
@@ -184,15 +229,46 @@ module Eco
|
|
|
184
229
|
|
|
185
230
|
private
|
|
186
231
|
|
|
232
|
+
# Looks up a person by `email`.
#   - A single match (either one existing user, or one person without account) is returned straight away.
#   - When there are several matches and `prevent_duplicates` is `true`, it raises.
#   - Otherwise it falls back to the person whose `external_id` equals `email`.
# @raise [MultipleSearchResults] when more than one person matches and `prevent_duplicates` is `true`.
# @param email [String, nil, false] the (already normalized) email to search for.
# @param prevent_duplicates [Boolean] whether multiple matches should raise an error.
# @return [Person, nil] the person found, or `nil`.
def person_by_email(email, prevent_duplicates: true)
  return nil unless email

  accountless = @by_non_users_email[email] || []
  user        = (@by_users_email[email] || []).first

  return user if user && accountless.empty?
  return accountless.first if !user && accountless.length == 1

  matches = user ? [user] + accountless : accountless
  if prevent_duplicates && !matches.empty?
    raise MultipleSearchResults.new(
      "Multiple search results match the criteria.",
      candidates: matches,
      property:   "email"
    )
  end

  @by_external_id[email]&.first
end
|
|
252
|
+
|
|
187
253
|
# Builds, only on the first call, the lookup tables used by the searchers.
# Every table but `@by_id` gets its `nil` key removed (see `no_nil_key`).
# NOTE(review): `to_h(attr)` presumably maps each attribute value to the people having it — confirm in the parent collection.
def init_caches
  unless @caches_init
    @by_id              = to_h
    @by_external_id     = no_nil_key(to_h('external_id'))
    @by_users_email     = no_nil_key(existing_users.to_h('email'))
    @by_non_users_email = no_nil_key(non_users.to_h('email'))
    @by_email           = no_nil_key(to_h('email'))
    @caches_init        = true
  end
end
|
|
195
262
|
|
|
263
|
+
# @return [People] the `users` for which `account_added?(:original)` is not truthy,
#   wrapped in a new collection via `newFrom`.
def existing_users
  already_users = users.select do |user|
    !user.account_added?(:original)
  end
  newFrom(already_users)
end
|
|
266
|
+
|
|
267
|
+
# Removes the `nil` key from `hash` (in place).
# @param hash [Hash] the hash to clean up.
# @return [Hash] the very same `hash` object, without the `nil` key.
def no_nil_key(hash)
  hash.delete(nil)
  hash
end
|
|
270
|
+
|
|
271
|
+
|
|
196
272
|
end
|
|
197
273
|
end
|
|
198
274
|
end
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
module Eco
  module API
    module Organization

      # Class to find out duplicates in the People Manager
      #
      # NOTE(review): helpers such as `find_all_with_score`, `item_string`, `recalculate_results`
      #   and `ignore_same_words_score` are not defined in this file; presumably they are
      #   provided by `Eco::Data::FuzzyMatch` — confirm.
      class PeopleSimilarity < Eco::API::Organization::People
        include Eco::Data::FuzzyMatch

        # @!group Config

        # @param attr [String, Symbol, Proc, nil] the target attribute to be read.
        def attribute=(attr)
          @attribute = attr
        end

        # @return [String, Symbol, Proc] the target attribute to be read (`:name` by default).
        def attribute
          @attribute ||= :name
        end

        # Returns the target value to analyse
        # @param person [Ecoportal::API::V1::Person]
        # @return [Object, nil] the result of calling `attribute` (when a `Proc`) with `person`,
        #   or the value of the `attribute` method of `person`; `nil` if `person` does not respond to it.
        def item_value(person)
          return attribute.call(person) if attribute.is_a?(Proc)
          attr = attribute.to_sym
          person.send(attr) if person.respond_to?(attr)
        end

        # Define the order of relevance of the per user matches
        # @param values [Array<Symbol>] the algorithms' results it should be ordered by
        #   * Possible values: `:dice`, `:levenshtein`, `:jaro_winkler`, `:ngrams`, `:words_ngrams`, `:chars_position`
        def order=(values)
          @order = values
        end

        # @return [Array<Symbol>] the algorithms' results to order by (`[:words_ngrams, :dice]` by default).
        def order
          @order ||= [:words_ngrams, :dice]
        end

        # Define the threshold of the per user matches
        # @param value [Float] the threshold that all of the algorithms should comply with
        def threshold=(value)
          @threshold = value
        end

        # @return [Float] the threshold (`0.15` by default).
        def threshold
          @threshold ||= 0.15
        end

        # Generates a new object with same config but different base `data`.
        # @return [Eco::API::Organization::PeopleSimilarity]
        def newFrom(data)
          super(data).tap do |simil|
            simil.threshold = threshold
            simil.order     = order
            simil.attribute = attribute
          end
        end

        # @!endgroup

        # @!group Searchers

        # It gathers those that have the same `email`
        # @return [Hash] where `keys` are `email`s and `values` an `Array<Person>`
        def repeated_emails
          init_caches
          @by_email.select { |_email, people| people.count > 1 }
        end

        # It returns all people with no name (less than 2 characters once stripped)
        # @return [Eco::API::Organization::PeopleSimilarity]
        def unnamed
          newFrom(select { |person| person.name.to_s.strip.length < 2 })
        end

        # It returns all people that have a name (at least 2 characters once stripped)
        # @return [Eco::API::Organization::PeopleSimilarity]
        def named
          newFrom(reject { |person| person.name.to_s.strip.length < 2 })
        end

        # It returns all the entries with `attribute` empty
        # @return [Eco::API::Organization::PeopleSimilarity]
        def blank_attribute
          newFrom(select { |person| item_value(person).to_s.strip.length < 2 })
        end

        # It returns all the entries with `attribute` **not** empty
        # @return [Eco::API::Organization::PeopleSimilarity]
        def attribute_present
          newFrom(reject { |person| item_value(person).to_s.strip.length < 2 })
        end

        # @!endgroup

        # @!group Analysis starters

        # Analyses People based on `options`
        # @param needle_read [Proc, Symbol] when the value to read from `needle` object is different to the `:read` (`attribute`).
        #   This allows to for example, facet `needle.name` (needle_read) against `haystack_item.details[alt_id]` (read).
        # @param keep_empty [Boolean] to indicate if it should keep people with no results (based on threshold)
        # @return [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
        def analyse(needle_read: nil, keep_empty: false, **options)
          options = { read: attribute }.merge(options)
          total   = count
          done    = 0
          analysed = each_with_object({}) do |person, results|
            needle_str = needle_read ? item_string(person, needle_read) : nil
            results[person.id] = find_all_with_score(person, needle_str: needle_str, **options)
            print_progress("Analysed", total, done += 1)
          end
          keep_empty ? analysed : clean_empty(analysed)
        end

        # @!endgroup

        # @!group Results Treatment

        # Gets a new instance object of this class, with only people in results
        # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
        # @return [Eco::API::Organization::PeopleSimilarity]
        def newSimilarity(analysed)
          newFrom(people_in_results(analysed))
        end

        # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
        # @return [Array<Person>] each person of `analysed` followed by its matches, without repetitions.
        def people_in_results(analysed)
          analysed.each_with_object([]) do |(id, results), people|
            related = results.each_with_object([self[id]]) do |result, group|
              group << result.match
            end
            related.each { |person| people << person unless people.include?(person) }
          end
        end

        # Removes from results those that do not have similar entries
        # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
        # @return [Hash] a new `Hash` with only the entries whose results are not `empty?`
        def clean_empty(analysed)
          analysed.reject { |_id, results| results.empty? }
        end

        # Helper to do some treatment of the results
        # @param analysed [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
        # @param keep_empty [Boolean] to indicate if it should keep the entries with no results.
        # @yield [person, results] the treatment to apply to the results of each person
        # @yieldreturn the new results to keep for that person
        # @return [Hash] where the _keys_ are the people `id`s and _values_ the `Eco::Data::FuzzyMatch::Results`
        def with_analysed(analysed, keep_empty: false)
          reanalysed = analysed.each_with_object({}) do |(id, results), out|
            out[id] = yield(self[id], results)
          end
          keep_empty ? reanalysed : clean_empty(reanalysed)
        end

        # Launches a reanalysis on `analysed` based on `options`
        # @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
        def rearrange(analysed, **options)
          with_analysed(analysed) do |_person, results|
            results.relevant_results(**options)
          end
        end

        # Reanalyses by using a block to treat the needle and item values
        # @note `options` is accepted for signature compatibility, but it is not used by this method.
        # @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
        # @param msg [String] the text shown in the progress line.
        def reanalyse(analysed, msg: "Reanalysing", **options, &block)
          total = analysed.count
          done  = 0
          with_analysed(analysed) do |_person, results|
            print_progress(msg, total, done += 1)
            recalculate_results(results, &block)
          end
        end

        # Reanalyses by ignoring matching words between the `needle` and those found in `results`
        def ignore_matching_words(analysed, **options)
          prompt = "Reanalysing by ignoring matching words"
          reanalyse(analysed, msg: prompt, **options) do |needle_str, item_str, _needle, _item|
            self.class.remove_matching_words(needle_str, item_str)
          end
        end

        # Reanalyses by ignoring matching words between the `needle` and those found in `results`
        def ignore_matching_words_old(analysed, **options)
          options = { read: attribute }.merge(options)
          total   = analysed.count
          done    = 0
          with_analysed(analysed) do |_person, results|
            print_progress("Reanalysing by ignoring matching words", total, done += 1)
            ignore_same_words_score(results, **options)
          end
        end

        # @!endgroup

        # @!group Reporting Helpers

        # @param analysed [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
        # @param format [Symbol] only `:txt` is supported.
        # @return [String, nil] well structured text (`nil` when `format` is not supported)
        def report(analysed, format: :txt)
          case format
          when :txt
            analysed.each_with_object("") do |(id, results), out|
              msg = results.results.map { |r| r.print }.join("\n ")
              out << "#{self[id].identify}:\n " + msg + "\n"
            end
          end
        end

        # @note
        #   1. Unless `:analysed` is provided, it launches an analysis
        #   2. It then prints the report of the analysis (once)
        # @return [Hash] where the _keys_ are the people `id`s and the _values_ the `Eco::Data::FuzzyMatch::Results`
        def print_analysis(**options)
          # NOTE(review): `results_with_false_positives` is not defined in this file — confirm where it comes from.
          analysed = options[:analysed] || results_with_false_positives.analyse(**options)
          # the report already covers all the entries: print it once, rather than once per entry
          puts report(analysed) unless analysed.empty?
          analysed
        end
        # @!endgroup

        protected

        def on_change
          # `remove_instance_variable` expects the **name** of the variable, and fails if it is not defined
          remove_instance_variable(:@fuzzy_match) if instance_variable_defined?(:@fuzzy_match)
          super
        end

        private

        # Prints a progress line that overwrites itself (only when there are more than 10 items to process).
        def print_progress(msg, total, num)
          return unless total > 10
          puts "" unless num > 1
          @print_msg_len ||= 0
          percent = (100 * num.to_f / total).round(1)
          msg     = " #{msg}: #{percent}% (#{num} of #{total})\r"
          @print_msg_len = msg.length unless @print_msg_len > msg.length
          print msg
          $stdout.flush
          if percent > 99.9
            # once finished, briefly show the last message and then blank the line
            sleep(0.2)
            print "#{" " * @print_msg_len}\r"
            $stdout.flush
          end
        end

      end
    end
  end
end
|