eco-helpers 2.0.18 → 2.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -1
- data/lib/eco/api/common/people/entry_factory.rb +26 -9
- data/lib/eco/api/common/people/person_entry.rb +1 -0
- data/lib/eco/api/common/session.rb +1 -0
- data/lib/eco/api/common/session/base_session.rb +2 -0
- data/lib/eco/api/common/session/helpers.rb +30 -0
- data/lib/eco/api/common/session/helpers/prompt_user.rb +34 -0
- data/lib/eco/api/common/version_patches/ecoportal_api/external_person.rb +1 -1
- data/lib/eco/api/common/version_patches/ecoportal_api/internal_person.rb +7 -4
- data/lib/eco/api/microcases/with_each.rb +67 -6
- data/lib/eco/api/microcases/with_each_present.rb +4 -2
- data/lib/eco/api/microcases/with_each_starter.rb +4 -2
- data/lib/eco/api/organization.rb +1 -1
- data/lib/eco/api/organization/people.rb +92 -23
- data/lib/eco/api/organization/people_similarity.rb +112 -0
- data/lib/eco/api/organization/person_schemas.rb +5 -1
- data/lib/eco/api/organization/policy_groups.rb +5 -1
- data/lib/eco/api/session.rb +5 -2
- data/lib/eco/api/session/batch.rb +7 -5
- data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +12 -35
- data/lib/eco/api/usecases/default_cases/to_csv_case.rb +81 -36
- data/lib/eco/api/usecases/default_cases/to_csv_detailed_case.rb +3 -4
- data/lib/eco/api/usecases/ooze_samples/ooze_update_case.rb +3 -2
- data/lib/eco/cli/config/default/options.rb +2 -1
- data/lib/eco/cli/config/default/usecases.rb +2 -0
- data/lib/eco/cli/config/default/workflow.rb +4 -1
- data/lib/eco/csv.rb +4 -2
- data/lib/eco/data/fuzzy_match.rb +63 -21
- data/lib/eco/data/fuzzy_match/ngrams_score.rb +7 -2
- data/lib/eco/data/fuzzy_match/pairing.rb +0 -1
- data/lib/eco/data/fuzzy_match/result.rb +7 -1
- data/lib/eco/data/fuzzy_match/results.rb +12 -6
- data/lib/eco/version.rb +1 -1
- metadata +4 -2
- data/lib/eco/api/organization/people_analytics.rb +0 -60
@@ -35,7 +35,8 @@ ASSETS.cli.config do |cnf|
|
|
35
35
|
session.schema = sch_id
|
36
36
|
end
|
37
37
|
|
38
|
-
desc
|
38
|
+
desc = "Used to be used to specify the input file when using -get-partial. "
|
39
|
+
desc += "It can also be useful to obtain `-get-partial` of people base on `:export` use cases (i.e. -people-to-csv)"
|
39
40
|
options_set.add("-entries-from", desc) do |options, session|
|
40
41
|
options.deep_merge!(input: {entries_from: true})
|
41
42
|
end
|
@@ -48,6 +48,8 @@ ASSETS.cli.config do |cnf|
|
|
48
48
|
options.deep_merge!(export: {options: {detailed: true}})
|
49
49
|
end.add_option("-permissions-custom", "Used with -detailed. Adds the permissions_custom abilities") do |options|
|
50
50
|
options.deep_merge!(export: {options: {permissions_custom: true}})
|
51
|
+
end.add_option("-split-schemas", "It will generate 1 file per each schema") do |options|
|
52
|
+
options.deep_merge!(export: {options: {split_schemas: true}})
|
51
53
|
end
|
52
54
|
|
53
55
|
desc = "Adds a column 'ecoPortalTag' to the input CSV with the tags that the location codes map to"
|
@@ -28,7 +28,10 @@ ASSETS.cli.config do |config|
|
|
28
28
|
cases_with_input = config.usecases.active(io: io).select do |usecase, data|
|
29
29
|
io.class.input_required?(usecase.type)
|
30
30
|
end
|
31
|
-
|
31
|
+
|
32
|
+
input_is_required = !cases_with_input.empty? || io.options.dig(:input, :entries_from)
|
33
|
+
missing_input = !io.input || io.input.empty?
|
34
|
+
next io unless missing_input && input_is_required
|
32
35
|
|
33
36
|
if io.options.dig(:input, :entries_from)
|
34
37
|
io = io.new(input: config.input.get(io: io))
|
data/lib/eco/csv.rb
CHANGED
@@ -18,8 +18,10 @@ module Eco
|
|
18
18
|
kargs = {headers: true, skip_blanks: true}.merge(kargs)
|
19
19
|
|
20
20
|
args = [file].tap do |arg|
|
21
|
-
|
22
|
-
|
21
|
+
encoding = Eco::API::Common::Session::FileManager.encoding(file)
|
22
|
+
#encoding = (encoding != "utf-8")? "#{encoding}|utf-8": encoding
|
23
|
+
#arg.push(encoding)
|
24
|
+
arg.push("rb:bom|utf-8") if encoding == "bom"
|
23
25
|
end
|
24
26
|
|
25
27
|
out = super(*args, **kargs).reject do |row|
|
data/lib/eco/data/fuzzy_match.rb
CHANGED
@@ -27,17 +27,28 @@ module Eco
|
|
27
27
|
include CharsPositionScore
|
28
28
|
include NGramsScore
|
29
29
|
|
30
|
-
def jaro_winkler(str1, str2)
|
30
|
+
def jaro_winkler(str1, str2, **options)
|
31
31
|
options = {
|
32
32
|
ignore_case: true,
|
33
33
|
weight: 0.25
|
34
|
-
}
|
34
|
+
}.merge(options)
|
35
35
|
JaroWinkler.distance(str1, str2, **options)
|
36
36
|
end
|
37
37
|
|
38
38
|
end
|
39
39
|
|
40
40
|
module InstanceMethods
|
41
|
+
FUZZY_MATCH_OPTIONS = [
|
42
|
+
:identities, :groupings, :stop_words, :read,
|
43
|
+
:must_match_grouping, :must_match_at_least_one_word,
|
44
|
+
:gather_last_result, :threshold
|
45
|
+
]
|
46
|
+
|
47
|
+
JARO_OPTIONS = [:ignore_case, :weight]
|
48
|
+
NGRAMS_OPTIONS = [:range]
|
49
|
+
POSITION_OPTIONS = [:max_distance]
|
50
|
+
RESULTS_OPTIONS = [:order, :threshold]
|
51
|
+
|
41
52
|
include StopWords
|
42
53
|
|
43
54
|
attr_accessor :fuzzy_options
|
@@ -46,19 +57,14 @@ module Eco
|
|
46
57
|
@fuzzy_options ||= {}
|
47
58
|
end
|
48
59
|
|
49
|
-
def fuzzy_match(
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
60
|
+
def fuzzy_match(haystack_data = nil, **options)
|
61
|
+
if instance_variable_defined?(:@fuzzy_match) && !haystack_data
|
62
|
+
return @fuzzy_match if fuzzy_match_options == fuzzy_match_options(options)
|
63
|
+
end
|
64
|
+
@fuzzy_options = options
|
54
65
|
# run with the native C extension (amatch) for better performance (~130 % faster)
|
55
66
|
::FuzzyMatch.engine = :amatch
|
56
|
-
|
57
|
-
if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
|
58
|
-
raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
|
59
|
-
end
|
60
|
-
end
|
61
|
-
@fuzzy_match = ::FuzzyMatch.new(haystack, fuzzy_options)
|
67
|
+
@fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
|
62
68
|
end
|
63
69
|
|
64
70
|
# @note
|
@@ -71,37 +77,73 @@ module Eco
|
|
71
77
|
unless item == needle
|
72
78
|
needle_str = item_string(needle)
|
73
79
|
item_str = item_string(item)
|
74
|
-
jaro_res =
|
75
|
-
ngram_res =
|
76
|
-
wngram_res =
|
77
|
-
pos_res =
|
80
|
+
jaro_res = jaro(needle_str, item_str)
|
81
|
+
ngram_res = ngram(needle_str, item_str)
|
82
|
+
wngram_res = words_ngram(needle_str, item_str)
|
83
|
+
pos_res = position(needle_str, item_str)
|
84
|
+
|
78
85
|
results << Result.new(item, item_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
|
79
86
|
end
|
80
87
|
end
|
81
|
-
Results.new(needle, item_string(needle), results)
|
88
|
+
Results.new(needle, item_string(needle), results).tap do |res|
|
89
|
+
res.order = fuzzy_options[:order] if fuzzy_options[:order]
|
90
|
+
res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
|
91
|
+
end
|
82
92
|
end
|
83
93
|
|
84
94
|
private
|
85
95
|
|
96
|
+
def jaro(str1, str2)
|
97
|
+
options = fuzzy_options.slice(*JARO_OPTIONS)
|
98
|
+
self.class.jaro_winkler(str1, str2, **options)
|
99
|
+
end
|
100
|
+
|
101
|
+
def ngram(str1, str2)
|
102
|
+
options = { range: 3..5 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
|
103
|
+
self.class.ngrams_score(str1, str2, **options).ratio
|
104
|
+
end
|
105
|
+
|
106
|
+
def words_ngram(str1, str2)
|
107
|
+
options = { range: 3..7 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
|
108
|
+
self.class.words_ngrams_score(str1, str2, **options).ratio
|
109
|
+
end
|
110
|
+
|
111
|
+
def position(str1, str2)
|
112
|
+
options = fuzzy_options.slice(*POSITION_OPTIONS)
|
113
|
+
self.class.chars_position_score(str1, str2, **options).ratio
|
114
|
+
end
|
115
|
+
|
86
116
|
# @note
|
87
117
|
# - When used in an `Enumerable` it will use `to_a`, or `values` if it's a `Hash`
|
88
118
|
# @param data [Enumerable, nil]
|
89
119
|
# @return [Array<Object>] the non-repeated values of `data`
|
90
|
-
def
|
120
|
+
def haystack(data = nil)
|
91
121
|
data = self if self.is_a?(Enumerable) && !data
|
92
122
|
raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)
|
93
123
|
data = self.is_a?(Hash) ? self.values.flatten : to_a.flatten
|
94
|
-
data.uniq.compact
|
124
|
+
data.uniq.compact.tap do |items|
|
125
|
+
if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
|
126
|
+
raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
|
127
|
+
end
|
128
|
+
end
|
95
129
|
end
|
96
130
|
|
97
131
|
def item_string(item, attr = fuzzy_read_method)
|
98
132
|
return item if !item || item.is_a?(String) || !attr
|
133
|
+
return attr.call(item) if attr.is_a?(Proc)
|
99
134
|
attr = attr.to_sym
|
100
135
|
return item.send(attr) if item.respond_to?(attr)
|
101
136
|
end
|
102
137
|
|
138
|
+
def fuzzy_match_options(options = nil)
|
139
|
+
options = fuzzy_options unless options
|
140
|
+
options.slice(*FUZZY_MATCH_OPTIONS).merge({
|
141
|
+
stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
|
142
|
+
})
|
143
|
+
end
|
144
|
+
|
103
145
|
def fuzzy_read_method
|
104
|
-
|
146
|
+
fuzzy_match_options[:read]
|
105
147
|
end
|
106
148
|
|
107
149
|
end
|
@@ -16,8 +16,13 @@ module Eco
|
|
16
16
|
|
17
17
|
Score.new(0, 0).tap do |score|
|
18
18
|
next if !str2 || !str1
|
19
|
-
|
20
|
-
|
19
|
+
if str1 == str2
|
20
|
+
score.increase_total(len1)
|
21
|
+
score.increase(score.total)
|
22
|
+
end
|
23
|
+
if str1.length < 2 || str1.length < 2
|
24
|
+
score.increase_total(len1)
|
25
|
+
end
|
21
26
|
|
22
27
|
paired_words(str1, str2, normalized: true) do |needle, item|
|
23
28
|
ngrams_score(needle, item, range: range, normalized: true)
|
@@ -27,7 +27,6 @@ module Eco
|
|
27
27
|
def paired_words(str1, str2, format: [:pair, :score], normalized: false)
|
28
28
|
str1, str2 = normalize_string([str1, str2]) unless normalized
|
29
29
|
return {} if !str2 || !str1
|
30
|
-
return score.increase(score.total) if str1 == str2
|
31
30
|
return {str1 => nil} if str1.length < 2 || str1.length < 2
|
32
31
|
|
33
32
|
needles = get_words(str1, normalized: true)
|
@@ -11,12 +11,18 @@ module Eco
|
|
11
11
|
def words_ngrams; super&.round(3); end
|
12
12
|
def chars_position; super&.round(3); end
|
13
13
|
|
14
|
+
def average
|
15
|
+
values = [dice, levenshtein, jaro_winkler, ngrams, words_ngrams, chars_position]
|
16
|
+
(values.inject(0.0, :+) / values.length).round(3)
|
17
|
+
end
|
18
|
+
|
14
19
|
# TODO: print in the order of `order`
|
15
20
|
def print
|
16
21
|
msg = "(Dice: #{dice}) (Lev Dst: #{levenshtein}) "
|
17
22
|
msg << "(Jaro: #{jaro_winkler}) "
|
18
23
|
msg << "(Ngram: #{ngrams}) (WNgrams: #{words_ngrams}) "
|
19
24
|
msg << "(C Pos: #{chars_position}) "
|
25
|
+
msg << "(Avg: #{average}) "
|
20
26
|
msg << "'#{value}'"
|
21
27
|
end
|
22
28
|
|
@@ -37,7 +43,7 @@ module Eco
|
|
37
43
|
|
38
44
|
def order=(values)
|
39
45
|
@order = [values].flatten.compact.tap do |o|
|
40
|
-
o
|
46
|
+
o << [:words_ngrams, :dice] if o.empty?
|
41
47
|
end
|
42
48
|
end
|
43
49
|
|
@@ -3,23 +3,29 @@ module Eco
|
|
3
3
|
module FuzzyMatch
|
4
4
|
class Results < Struct.new(:needle, :value, :raw_results)
|
5
5
|
|
6
|
+
attr_accessor :threshold
|
7
|
+
|
6
8
|
def results_with_false_positives
|
7
|
-
relevant_results(
|
9
|
+
relevant_results(order: :jaro_winkler, threshold: 0.5)
|
8
10
|
end
|
9
11
|
|
10
|
-
def relevant_results(
|
12
|
+
def relevant_results(**options)
|
13
|
+
options = {order: order, threshold: threshold || 0.5}.merge(options)
|
11
14
|
raw_results.select do |result|
|
12
|
-
result.all_threshold?(
|
15
|
+
result.all_threshold?(options[:order], options[:threshold])
|
13
16
|
end.yield_self do |filtered|
|
14
17
|
self.class.new(needle, value, filtered).tap do |results|
|
15
|
-
results.order =
|
18
|
+
results.order = options[:order]
|
16
19
|
end
|
17
20
|
end
|
18
21
|
end
|
19
22
|
|
23
|
+
# @param values [Array<Symbol>] the algorithms' results it should be ordered by
|
24
|
+
# * Possible values: `:dice`, `:levenshtein`, `:jaro_winkler`, `:ngrams`, `:words_ngrams`, `:chars_position`, `:average`
|
20
25
|
def order=(values)
|
21
|
-
@order = [values].flatten.compact
|
22
|
-
|
26
|
+
@order = [values].flatten.compact.tap do |o|
|
27
|
+
raw_results.each {|r| r.order = o}
|
28
|
+
end
|
23
29
|
end
|
24
30
|
|
25
31
|
def order
|
data/lib/eco/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: eco-helpers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.
|
4
|
+
version: 2.0.19
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Oscar Segura
|
@@ -345,6 +345,8 @@ files:
|
|
345
345
|
- lib/eco/api/common/session/base_session.rb
|
346
346
|
- lib/eco/api/common/session/environment.rb
|
347
347
|
- lib/eco/api/common/session/file_manager.rb
|
348
|
+
- lib/eco/api/common/session/helpers.rb
|
349
|
+
- lib/eco/api/common/session/helpers/prompt_user.rb
|
348
350
|
- lib/eco/api/common/session/logger.rb
|
349
351
|
- lib/eco/api/common/session/logger/cache.rb
|
350
352
|
- lib/eco/api/common/session/logger/log.rb
|
@@ -397,7 +399,7 @@ files:
|
|
397
399
|
- lib/eco/api/organization.rb
|
398
400
|
- lib/eco/api/organization/login_providers.rb
|
399
401
|
- lib/eco/api/organization/people.rb
|
400
|
-
- lib/eco/api/organization/
|
402
|
+
- lib/eco/api/organization/people_similarity.rb
|
401
403
|
- lib/eco/api/organization/person_schemas.rb
|
402
404
|
- lib/eco/api/organization/policy_groups.rb
|
403
405
|
- lib/eco/api/organization/preferences.rb
|
@@ -1,60 +0,0 @@
|
|
1
|
-
module Eco
|
2
|
-
module API
|
3
|
-
module Organization
|
4
|
-
class PeopleAnalytics < Eco::API::Organization::People
|
5
|
-
include Eco::Data::FuzzyMatch
|
6
|
-
|
7
|
-
# @!group Helpers
|
8
|
-
|
9
|
-
# @!endgroup
|
10
|
-
|
11
|
-
# @!group Searchers
|
12
|
-
|
13
|
-
# It gathers those that have the same `email`
|
14
|
-
# @return [Hash] where `keys` are `email`s and `values` an `Array<Person>`
|
15
|
-
def repeated_emails
|
16
|
-
init_caches
|
17
|
-
@by_email.select do |email, people|
|
18
|
-
people.count > 1
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
# @!endgroup
|
23
|
-
|
24
|
-
# @!group Analysers
|
25
|
-
|
26
|
-
# TODO: Sort results by `results.first.methods`
|
27
|
-
def similarity(**options)
|
28
|
-
each_with_object({}) do |person, results|
|
29
|
-
results[person.id] = find_all_with_score(person, **options)
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
|
34
|
-
def print_analysis(threshold)
|
35
|
-
similarity.each do |id, results|
|
36
|
-
msg = results.results.select do |result|
|
37
|
-
result.threshold?(threshold)
|
38
|
-
end.map do |result|
|
39
|
-
result.print
|
40
|
-
end.join("\n ")
|
41
|
-
|
42
|
-
puts "'#{self[id].identify}':\n " + msg
|
43
|
-
end
|
44
|
-
end
|
45
|
-
# @!endgroup
|
46
|
-
|
47
|
-
protected
|
48
|
-
|
49
|
-
def on_change
|
50
|
-
remove_instance_variable(@fuzzy_match)
|
51
|
-
super
|
52
|
-
end
|
53
|
-
|
54
|
-
private
|
55
|
-
|
56
|
-
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
60
|
-
end
|