eco-helpers 2.0.18 → 2.0.19
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -1
- data/lib/eco/api/common/people/entry_factory.rb +26 -9
- data/lib/eco/api/common/people/person_entry.rb +1 -0
- data/lib/eco/api/common/session.rb +1 -0
- data/lib/eco/api/common/session/base_session.rb +2 -0
- data/lib/eco/api/common/session/helpers.rb +30 -0
- data/lib/eco/api/common/session/helpers/prompt_user.rb +34 -0
- data/lib/eco/api/common/version_patches/ecoportal_api/external_person.rb +1 -1
- data/lib/eco/api/common/version_patches/ecoportal_api/internal_person.rb +7 -4
- data/lib/eco/api/microcases/with_each.rb +67 -6
- data/lib/eco/api/microcases/with_each_present.rb +4 -2
- data/lib/eco/api/microcases/with_each_starter.rb +4 -2
- data/lib/eco/api/organization.rb +1 -1
- data/lib/eco/api/organization/people.rb +92 -23
- data/lib/eco/api/organization/people_similarity.rb +112 -0
- data/lib/eco/api/organization/person_schemas.rb +5 -1
- data/lib/eco/api/organization/policy_groups.rb +5 -1
- data/lib/eco/api/session.rb +5 -2
- data/lib/eco/api/session/batch.rb +7 -5
- data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +12 -35
- data/lib/eco/api/usecases/default_cases/to_csv_case.rb +81 -36
- data/lib/eco/api/usecases/default_cases/to_csv_detailed_case.rb +3 -4
- data/lib/eco/api/usecases/ooze_samples/ooze_update_case.rb +3 -2
- data/lib/eco/cli/config/default/options.rb +2 -1
- data/lib/eco/cli/config/default/usecases.rb +2 -0
- data/lib/eco/cli/config/default/workflow.rb +4 -1
- data/lib/eco/csv.rb +4 -2
- data/lib/eco/data/fuzzy_match.rb +63 -21
- data/lib/eco/data/fuzzy_match/ngrams_score.rb +7 -2
- data/lib/eco/data/fuzzy_match/pairing.rb +0 -1
- data/lib/eco/data/fuzzy_match/result.rb +7 -1
- data/lib/eco/data/fuzzy_match/results.rb +12 -6
- data/lib/eco/version.rb +1 -1
- metadata +4 -2
- data/lib/eco/api/organization/people_analytics.rb +0 -60
@@ -35,7 +35,8 @@ ASSETS.cli.config do |cnf|
|
|
35
35
|
session.schema = sch_id
|
36
36
|
end
|
37
37
|
|
38
|
-
desc
|
38
|
+
desc = "Used to be used to specify the input file when using -get-partial. "
|
39
|
+
desc += "It can also be useful to obtain `-get-partial` of people base on `:export` use cases (i.e. -people-to-csv)"
|
39
40
|
options_set.add("-entries-from", desc) do |options, session|
|
40
41
|
options.deep_merge!(input: {entries_from: true})
|
41
42
|
end
|
@@ -48,6 +48,8 @@ ASSETS.cli.config do |cnf|
|
|
48
48
|
options.deep_merge!(export: {options: {detailed: true}})
|
49
49
|
end.add_option("-permissions-custom", "Used with -detailed. Adds the permissions_custom abilities") do |options|
|
50
50
|
options.deep_merge!(export: {options: {permissions_custom: true}})
|
51
|
+
end.add_option("-split-schemas", "It will generate 1 file per each schema") do |options|
|
52
|
+
options.deep_merge!(export: {options: {split_schemas: true}})
|
51
53
|
end
|
52
54
|
|
53
55
|
desc = "Adds a column 'ecoPortalTag' to the input CSV with the tags that the location codes map to"
|
@@ -28,7 +28,10 @@ ASSETS.cli.config do |config|
|
|
28
28
|
cases_with_input = config.usecases.active(io: io).select do |usecase, data|
|
29
29
|
io.class.input_required?(usecase.type)
|
30
30
|
end
|
31
|
-
|
31
|
+
|
32
|
+
input_is_required = !cases_with_input.empty? || io.options.dig(:input, :entries_from)
|
33
|
+
missing_input = !io.input || io.input.empty?
|
34
|
+
next io unless missing_input && input_is_required
|
32
35
|
|
33
36
|
if io.options.dig(:input, :entries_from)
|
34
37
|
io = io.new(input: config.input.get(io: io))
|
data/lib/eco/csv.rb
CHANGED
@@ -18,8 +18,10 @@ module Eco
|
|
18
18
|
kargs = {headers: true, skip_blanks: true}.merge(kargs)
|
19
19
|
|
20
20
|
args = [file].tap do |arg|
|
21
|
-
|
22
|
-
|
21
|
+
encoding = Eco::API::Common::Session::FileManager.encoding(file)
|
22
|
+
#encoding = (encoding != "utf-8")? "#{encoding}|utf-8": encoding
|
23
|
+
#arg.push(encoding)
|
24
|
+
arg.push("rb:bom|utf-8") if encoding == "bom"
|
23
25
|
end
|
24
26
|
|
25
27
|
out = super(*args, **kargs).reject do |row|
|
data/lib/eco/data/fuzzy_match.rb
CHANGED
@@ -27,17 +27,28 @@ module Eco
|
|
27
27
|
include CharsPositionScore
|
28
28
|
include NGramsScore
|
29
29
|
|
30
|
-
def jaro_winkler(str1, str2)
|
30
|
+
def jaro_winkler(str1, str2, **options)
|
31
31
|
options = {
|
32
32
|
ignore_case: true,
|
33
33
|
weight: 0.25
|
34
|
-
}
|
34
|
+
}.merge(options)
|
35
35
|
JaroWinkler.distance(str1, str2, **options)
|
36
36
|
end
|
37
37
|
|
38
38
|
end
|
39
39
|
|
40
40
|
module InstanceMethods
|
41
|
+
FUZZY_MATCH_OPTIONS = [
|
42
|
+
:identities, :groupings, :stop_words, :read,
|
43
|
+
:must_match_grouping, :must_match_at_least_one_word,
|
44
|
+
:gather_last_result, :threshold
|
45
|
+
]
|
46
|
+
|
47
|
+
JARO_OPTIONS = [:ignore_case, :weight]
|
48
|
+
NGRAMS_OPTIONS = [:range]
|
49
|
+
POSITION_OPTIONS = [:max_distance]
|
50
|
+
RESULTS_OPTIONS = [:order, :threshold]
|
51
|
+
|
41
52
|
include StopWords
|
42
53
|
|
43
54
|
attr_accessor :fuzzy_options
|
@@ -46,19 +57,14 @@ module Eco
|
|
46
57
|
@fuzzy_options ||= {}
|
47
58
|
end
|
48
59
|
|
49
|
-
def fuzzy_match(
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
60
|
+
def fuzzy_match(haystack_data = nil, **options)
|
61
|
+
if instance_variable_defined?(:@fuzzy_match) && !haystack_data
|
62
|
+
return @fuzzy_match if fuzzy_match_options == fuzzy_match_options(options)
|
63
|
+
end
|
64
|
+
@fuzzy_options = options
|
54
65
|
# make it run with a native C extension (for better performance: ~130 % increase of performance)
|
55
66
|
::FuzzyMatch.engine = :amatch
|
56
|
-
|
57
|
-
if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
|
58
|
-
raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
|
59
|
-
end
|
60
|
-
end
|
61
|
-
@fuzzy_match = ::FuzzyMatch.new(haystack, fuzzy_options)
|
67
|
+
@fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
|
62
68
|
end
|
63
69
|
|
64
70
|
# @note
|
@@ -71,37 +77,73 @@ module Eco
|
|
71
77
|
unless item == needle
|
72
78
|
needle_str = item_string(needle)
|
73
79
|
item_str = item_string(item)
|
74
|
-
jaro_res =
|
75
|
-
ngram_res =
|
76
|
-
wngram_res =
|
77
|
-
pos_res =
|
80
|
+
jaro_res = jaro(needle_str, item_str)
|
81
|
+
ngram_res = ngram(needle_str, item_str)
|
82
|
+
wngram_res = words_ngram(needle_str, item_str)
|
83
|
+
pos_res = position(needle_str, item_str)
|
84
|
+
|
78
85
|
results << Result.new(item, item_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
|
79
86
|
end
|
80
87
|
end
|
81
|
-
Results.new(needle, item_string(needle), results)
|
88
|
+
Results.new(needle, item_string(needle), results).tap do |res|
|
89
|
+
res.order = fuzzy_options[:order] if fuzzy_options[:order]
|
90
|
+
res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
|
91
|
+
end
|
82
92
|
end
|
83
93
|
|
84
94
|
private
|
85
95
|
|
96
|
+
def jaro(str1, str2)
|
97
|
+
options = fuzzy_options.slice(*JARO_OPTIONS)
|
98
|
+
self.class.jaro_winkler(str1, str2, **options)
|
99
|
+
end
|
100
|
+
|
101
|
+
def ngram(str1, str2)
|
102
|
+
options = { range: 3..5 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
|
103
|
+
self.class.ngrams_score(str1, str2, **options).ratio
|
104
|
+
end
|
105
|
+
|
106
|
+
def words_ngram(str1, str2)
|
107
|
+
options = { range: 3..7 }.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
|
108
|
+
self.class.words_ngrams_score(str1, str2, **options).ratio
|
109
|
+
end
|
110
|
+
|
111
|
+
def position(str1, str2)
|
112
|
+
options = fuzzy_options.slice(*POSITION_OPTIONS)
|
113
|
+
self.class.chars_position_score(str1, str2, **options).ratio
|
114
|
+
end
|
115
|
+
|
86
116
|
# @note
|
87
117
|
# - When used in an `Enumerable` it will use `to_a`, or `values` if it's a `Hash`
|
88
118
|
# @param data [Enumerable, nil]
|
89
119
|
# @return [Array<Object>] the non-repeated values of `data`
|
90
|
-
def
|
120
|
+
def haystack(data = nil)
|
91
121
|
data = self if self.is_a?(Enumerable) && !data
|
92
122
|
raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)
|
93
123
|
data = self.is_a?(Hash) ? self.values.flatten : to_a.flatten
|
94
|
-
data.uniq.compact
|
124
|
+
data.uniq.compact.tap do |items|
|
125
|
+
if !fuzzy_read_method && found = items.find {|item| !item.is_a?(String)}
|
126
|
+
raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
|
127
|
+
end
|
128
|
+
end
|
95
129
|
end
|
96
130
|
|
97
131
|
def item_string(item, attr = fuzzy_read_method)
|
98
132
|
return item if !item || item.is_a?(String) || !attr
|
133
|
+
return attr.call(item) if attr.is_a?(Proc)
|
99
134
|
attr = attr.to_sym
|
100
135
|
return item.send(attr) if item.respond_to?(attr)
|
101
136
|
end
|
102
137
|
|
138
|
+
def fuzzy_match_options(options = nil)
|
139
|
+
options = fuzzy_options unless options
|
140
|
+
options.slice(*FUZZY_MATCH_OPTIONS).merge({
|
141
|
+
stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
|
142
|
+
})
|
143
|
+
end
|
144
|
+
|
103
145
|
def fuzzy_read_method
|
104
|
-
|
146
|
+
fuzzy_match_options[:read]
|
105
147
|
end
|
106
148
|
|
107
149
|
end
|
@@ -16,8 +16,13 @@ module Eco
|
|
16
16
|
|
17
17
|
Score.new(0, 0).tap do |score|
|
18
18
|
next if !str2 || !str1
|
19
|
-
|
20
|
-
|
19
|
+
if str1 == str2
|
20
|
+
score.increase_total(len1)
|
21
|
+
score.increase(score.total)
|
22
|
+
end
|
23
|
+
if str1.length < 2 || str1.length < 2
|
24
|
+
score.increase_total(len1)
|
25
|
+
end
|
21
26
|
|
22
27
|
paired_words(str1, str2, normalized: true) do |needle, item|
|
23
28
|
ngrams_score(needle, item, range: range, normalized: true)
|
@@ -27,7 +27,6 @@ module Eco
|
|
27
27
|
def paired_words(str1, str2, format: [:pair, :score], normalized: false)
|
28
28
|
str1, str2 = normalize_string([str1, str2]) unless normalized
|
29
29
|
return {} if !str2 || !str1
|
30
|
-
return score.increase(score.total) if str1 == str2
|
31
30
|
return {str1 => nil} if str1.length < 2 || str1.length < 2
|
32
31
|
|
33
32
|
needles = get_words(str1, normalized: true)
|
@@ -11,12 +11,18 @@ module Eco
|
|
11
11
|
def words_ngrams; super&.round(3); end
|
12
12
|
def chars_position; super&.round(3); end
|
13
13
|
|
14
|
+
def average
|
15
|
+
values = [dice, levenshtein, jaro_winkler, ngrams, words_ngrams, chars_position]
|
16
|
+
(values.inject(0.0, :+) / values.length).round(3)
|
17
|
+
end
|
18
|
+
|
14
19
|
# TODO: print in the order of `order`
|
15
20
|
def print
|
16
21
|
msg = "(Dice: #{dice}) (Lev Dst: #{levenshtein}) "
|
17
22
|
msg << "(Jaro: #{jaro_winkler}) "
|
18
23
|
msg << "(Ngram: #{ngrams}) (WNgrams: #{words_ngrams}) "
|
19
24
|
msg << "(C Pos: #{chars_position}) "
|
25
|
+
msg << "(Avg: #{average}) "
|
20
26
|
msg << "'#{value}'"
|
21
27
|
end
|
22
28
|
|
@@ -37,7 +43,7 @@ module Eco
|
|
37
43
|
|
38
44
|
def order=(values)
|
39
45
|
@order = [values].flatten.compact.tap do |o|
|
40
|
-
o
|
46
|
+
o << [:words_ngrams, :dice] if o.empty?
|
41
47
|
end
|
42
48
|
end
|
43
49
|
|
@@ -3,23 +3,29 @@ module Eco
|
|
3
3
|
module FuzzyMatch
|
4
4
|
class Results < Struct.new(:needle, :value, :raw_results)
|
5
5
|
|
6
|
+
attr_accessor :threshold
|
7
|
+
|
6
8
|
def results_with_false_positives
|
7
|
-
relevant_results(
|
9
|
+
relevant_results(order: :jaro_winkler, threshold: 0.5)
|
8
10
|
end
|
9
11
|
|
10
|
-
def relevant_results(
|
12
|
+
def relevant_results(**options)
|
13
|
+
options = {order: order, threshold: threshold || 0.5}.merge(options)
|
11
14
|
raw_results.select do |result|
|
12
|
-
result.all_threshold?(
|
15
|
+
result.all_threshold?(options[:order], options[:threshold])
|
13
16
|
end.yield_self do |filtered|
|
14
17
|
self.class.new(needle, value, filtered).tap do |results|
|
15
|
-
results.order =
|
18
|
+
results.order = options[:order]
|
16
19
|
end
|
17
20
|
end
|
18
21
|
end
|
19
22
|
|
23
|
+
# @param values[Array<Symbol>] the algorithms' results it should be ordered by
|
24
|
+
# * Possible values: `:dice`, `:levenshtein`, `:jaro_winkler`, `:ngrams`, `:words_ngrams`, `:chars_position`, `:average`
|
20
25
|
def order=(values)
|
21
|
-
@order = [values].flatten.compact
|
22
|
-
|
26
|
+
@order = [values].flatten.compact.tap do |o|
|
27
|
+
raw_results.each {|r| r.order = o}
|
28
|
+
end
|
23
29
|
end
|
24
30
|
|
25
31
|
def order
|
data/lib/eco/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: eco-helpers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.
|
4
|
+
version: 2.0.19
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Oscar Segura
|
@@ -345,6 +345,8 @@ files:
|
|
345
345
|
- lib/eco/api/common/session/base_session.rb
|
346
346
|
- lib/eco/api/common/session/environment.rb
|
347
347
|
- lib/eco/api/common/session/file_manager.rb
|
348
|
+
- lib/eco/api/common/session/helpers.rb
|
349
|
+
- lib/eco/api/common/session/helpers/prompt_user.rb
|
348
350
|
- lib/eco/api/common/session/logger.rb
|
349
351
|
- lib/eco/api/common/session/logger/cache.rb
|
350
352
|
- lib/eco/api/common/session/logger/log.rb
|
@@ -397,7 +399,7 @@ files:
|
|
397
399
|
- lib/eco/api/organization.rb
|
398
400
|
- lib/eco/api/organization/login_providers.rb
|
399
401
|
- lib/eco/api/organization/people.rb
|
400
|
-
- lib/eco/api/organization/
|
402
|
+
- lib/eco/api/organization/people_similarity.rb
|
401
403
|
- lib/eco/api/organization/person_schemas.rb
|
402
404
|
- lib/eco/api/organization/policy_groups.rb
|
403
405
|
- lib/eco/api/organization/preferences.rb
|
@@ -1,60 +0,0 @@
|
|
1
|
-
module Eco
|
2
|
-
module API
|
3
|
-
module Organization
|
4
|
-
class PeopleAnalytics < Eco::API::Organization::People
|
5
|
-
include Eco::Data::FuzzyMatch
|
6
|
-
|
7
|
-
# @!group Helpers
|
8
|
-
|
9
|
-
# @!endgroup
|
10
|
-
|
11
|
-
# @!group Searchers
|
12
|
-
|
13
|
-
# It gathers those that have the same `email`
|
14
|
-
# @return [Hash] where `keys` are `email`s and `values` an `Array<Person>`
|
15
|
-
def repeated_emails
|
16
|
-
init_caches
|
17
|
-
@by_email.select do |email, people|
|
18
|
-
people.count > 1
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
# @!endgroup
|
23
|
-
|
24
|
-
# @!group Analysers
|
25
|
-
|
26
|
-
# TODO: Sort results by `results.first.methods`
|
27
|
-
def similarity(**options)
|
28
|
-
each_with_object({}) do |person, results|
|
29
|
-
results[person.id] = find_all_with_score(person, **options)
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
|
34
|
-
def print_analysis(threshold)
|
35
|
-
similarity.each do |id, results|
|
36
|
-
msg = results.results.select do |result|
|
37
|
-
result.threshold?(threshold)
|
38
|
-
end.map do |result|
|
39
|
-
result.print
|
40
|
-
end.join("\n ")
|
41
|
-
|
42
|
-
puts "'#{self[id].identify}':\n " + msg
|
43
|
-
end
|
44
|
-
end
|
45
|
-
# @!endgroup
|
46
|
-
|
47
|
-
protected
|
48
|
-
|
49
|
-
def on_change
|
50
|
-
remove_instance_variable(@fuzzy_match)
|
51
|
-
super
|
52
|
-
end
|
53
|
-
|
54
|
-
private
|
55
|
-
|
56
|
-
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
60
|
-
end
|