eco-helpers 2.0.16 → 2.0.22
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +125 -6
- data/eco-helpers.gemspec +10 -5
- data/lib/eco-helpers.rb +2 -0
- data/lib/eco/api/common/base_loader.rb +18 -0
- data/lib/eco/api/common/loaders/parser.rb +1 -0
- data/lib/eco/api/common/people/default_parsers.rb +1 -0
- data/lib/eco/api/common/people/default_parsers/date_parser.rb +11 -1
- data/lib/eco/api/common/people/default_parsers/login_providers_parser.rb +1 -1
- data/lib/eco/api/common/people/default_parsers/policy_groups_parser.rb +11 -11
- data/lib/eco/api/common/people/default_parsers/xls_parser.rb +53 -0
- data/lib/eco/api/common/people/entries.rb +1 -0
- data/lib/eco/api/common/people/entry_factory.rb +88 -23
- data/lib/eco/api/common/people/person_entry.rb +5 -2
- data/lib/eco/api/common/people/person_parser.rb +1 -1
- data/lib/eco/api/common/session.rb +1 -0
- data/lib/eco/api/common/session/base_session.rb +2 -0
- data/lib/eco/api/common/session/helpers.rb +30 -0
- data/lib/eco/api/common/session/helpers/prompt_user.rb +34 -0
- data/lib/eco/api/common/session/mailer.rb +0 -1
- data/lib/eco/api/common/session/s3_uploader.rb +0 -1
- data/lib/eco/api/common/session/sftp.rb +0 -1
- data/lib/eco/api/common/version_patches/ecoportal_api/external_person.rb +1 -1
- data/lib/eco/api/common/version_patches/ecoportal_api/internal_person.rb +7 -4
- data/lib/eco/api/common/version_patches/exception.rb +11 -4
- data/lib/eco/api/microcases.rb +3 -1
- data/lib/eco/api/microcases/append_usergroups.rb +0 -1
- data/lib/eco/api/microcases/people_cache.rb +2 -2
- data/lib/eco/api/microcases/people_load.rb +2 -2
- data/lib/eco/api/microcases/people_refresh.rb +2 -2
- data/lib/eco/api/microcases/people_search.rb +6 -6
- data/lib/eco/api/microcases/preserve_default_tag.rb +23 -0
- data/lib/eco/api/microcases/preserve_filter_tags.rb +28 -0
- data/lib/eco/api/microcases/preserve_policy_groups.rb +30 -0
- data/lib/eco/api/microcases/set_account.rb +0 -1
- data/lib/eco/api/microcases/with_each.rb +67 -6
- data/lib/eco/api/microcases/with_each_present.rb +4 -2
- data/lib/eco/api/microcases/with_each_starter.rb +4 -2
- data/lib/eco/api/organization.rb +1 -0
- data/lib/eco/api/organization/people.rb +98 -22
- data/lib/eco/api/organization/people_similarity.rb +272 -0
- data/lib/eco/api/organization/person_schemas.rb +5 -1
- data/lib/eco/api/organization/policy_groups.rb +5 -1
- data/lib/eco/api/organization/presets_factory.rb +22 -83
- data/lib/eco/api/organization/presets_integrity.json +6 -0
- data/lib/eco/api/organization/presets_values.json +5 -4
- data/lib/eco/api/organization/tag_tree.rb +33 -0
- data/lib/eco/api/policies/default_policies/99_user_access_policy.rb +0 -30
- data/lib/eco/api/session.rb +20 -28
- data/lib/eco/api/session/batch.rb +25 -7
- data/lib/eco/api/session/config.rb +0 -10
- data/lib/eco/api/session/config/apis.rb +80 -14
- data/lib/eco/api/session/config/people.rb +1 -17
- data/lib/eco/api/usecases.rb +2 -2
- data/lib/eco/api/usecases/base_case.rb +2 -2
- data/lib/eco/api/usecases/base_io.rb +17 -4
- data/lib/eco/api/usecases/default_cases.rb +2 -1
- data/lib/eco/api/usecases/default_cases/abstract_policygroup_abilities_case.rb +4 -4
- data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +223 -0
- data/lib/eco/api/usecases/default_cases/clean_unknown_tags_case.rb +37 -0
- data/lib/eco/api/usecases/default_cases/codes_to_tags_case.rb +2 -3
- data/lib/eco/api/usecases/default_cases/reset_landing_page_case.rb +11 -1
- data/lib/eco/api/usecases/default_cases/restore_db_case.rb +1 -2
- data/lib/eco/api/usecases/default_cases/supers_cyclic_identify_case.rb +1 -1
- data/lib/eco/api/usecases/default_cases/supers_hierarchy_case.rb +1 -1
- data/lib/eco/api/usecases/default_cases/to_csv_case.rb +132 -29
- data/lib/eco/api/usecases/default_cases/to_csv_detailed_case.rb +61 -36
- data/lib/eco/api/usecases/ooze_samples/ooze_update_case.rb +3 -2
- data/lib/eco/cli/config/default/input.rb +61 -8
- data/lib/eco/cli/config/default/options.rb +48 -17
- data/lib/eco/cli/config/default/people.rb +18 -24
- data/lib/eco/cli/config/default/people_filters.rb +3 -3
- data/lib/eco/cli/config/default/usecases.rb +97 -32
- data/lib/eco/cli/config/default/workflow.rb +22 -13
- data/lib/eco/cli/config/help.rb +1 -0
- data/lib/eco/cli/config/options_set.rb +106 -13
- data/lib/eco/cli/config/use_cases.rb +33 -33
- data/lib/eco/cli/scripting/args_helpers.rb +32 -5
- data/lib/eco/csv.rb +4 -2
- data/lib/eco/csv/table.rb +121 -21
- data/lib/eco/data.rb +1 -0
- data/lib/eco/data/crypto/encryption.rb +3 -3
- data/lib/eco/data/files/helpers.rb +6 -4
- data/lib/eco/data/fuzzy_match.rb +201 -0
- data/lib/eco/data/fuzzy_match/array_helpers.rb +75 -0
- data/lib/eco/data/fuzzy_match/chars_position_score.rb +38 -0
- data/lib/eco/data/fuzzy_match/ngrams_score.rb +82 -0
- data/lib/eco/data/fuzzy_match/pairing.rb +95 -0
- data/lib/eco/data/fuzzy_match/result.rb +87 -0
- data/lib/eco/data/fuzzy_match/results.rb +77 -0
- data/lib/eco/data/fuzzy_match/score.rb +49 -0
- data/lib/eco/data/fuzzy_match/stop_words.rb +35 -0
- data/lib/eco/data/fuzzy_match/string_helpers.rb +82 -0
- data/lib/eco/version.rb +1 -1
- metadata +147 -11
- data/lib/eco/api/microcases/refresh_abilities.rb +0 -19
- data/lib/eco/api/organization/presets_reference.json +0 -59
- data/lib/eco/api/usecases/default_cases/refresh_abilities_case.rb +0 -30
data/lib/eco/data.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'openssl'
|
2
|
-
|
2
|
+
# 'json'
|
3
3
|
require 'base64'
|
4
|
-
require 'pp'
|
4
|
+
#require 'pp'
|
5
5
|
|
6
6
|
require_relative '../../cli/scripting'
|
7
7
|
|
@@ -147,7 +147,7 @@ module Eco
|
|
147
147
|
return str_c
|
148
148
|
#EncryptedData.new({content: str_c, key: key, iv: iv})
|
149
149
|
|
150
|
-
|
150
|
+
|
151
151
|
end
|
152
152
|
def aes256_decrypt(data, key: , iv: , block_octets: BLOCK_OCTETS)
|
153
153
|
block_bits = block_bits * 8
|
@@ -3,11 +3,13 @@ module Eco
|
|
3
3
|
module Files
|
4
4
|
DEFAULT_TIMESTAMP_PATTERN = '%Y-%m-%dT%H%M%S'
|
5
5
|
|
6
|
-
|
7
|
-
base
|
8
|
-
|
6
|
+
class << self
|
7
|
+
def included(base)
|
8
|
+
base.send(:include, InstanceMethods)
|
9
|
+
base.extend(ClassMethods)
|
10
|
+
end
|
9
11
|
end
|
10
|
-
|
12
|
+
|
11
13
|
module InstanceMethods
|
12
14
|
|
13
15
|
end
|
@@ -0,0 +1,201 @@
|
|
1
|
+
require 'fuzzy_match'
|
2
|
+
require 'amatch'
|
3
|
+
require 'jaro_winkler'
|
4
|
+
|
5
|
+
require_relative 'fuzzy_match/stop_words'
|
6
|
+
require_relative 'fuzzy_match/array_helpers'
|
7
|
+
require_relative 'fuzzy_match/string_helpers'
|
8
|
+
require_relative 'fuzzy_match/pairing'
|
9
|
+
require_relative 'fuzzy_match/chars_position_score'
|
10
|
+
require_relative 'fuzzy_match/ngrams_score'
|
11
|
+
|
12
|
+
module Eco
|
13
|
+
module Data
|
14
|
+
module FuzzyMatch
|
15
|
+
|
16
|
+
class << self
  # Hook invoked with the class or module that mixes this module in.
  # It adds the instance-level API (`InstanceMethods`) to `base` and makes
  # the helpers of `ClassMethods` available on `base` itself.
  # @param base [Module] the class or module including this module.
  def included(base)
    # `Module#include` is public, so there is no need to go through `send`.
    base.include(InstanceMethods)
    base.extend(ClassMethods)
  end
end
|
22
|
+
|
23
|
+
module ClassMethods
  include ArrayHelpers
  include StringHelpers
  include Pairing
  include CharsPositionScore
  include NGramsScore

  # Thin wrapper around `JaroWinkler.distance` with this library's defaults
  # (`ignore_case: true`, `weight: 0.25`); any key given in `options`
  # overrides the corresponding default.
  # @param str1 [String, nil] first string to compare.
  # @param str2 [String, nil] second string to compare.
  # @param options [Hash] options passed on to `JaroWinkler.distance`.
  # @return [Numeric] `0` when either string is missing, otherwise the
  #   value returned by `JaroWinkler.distance`.
  def jaro_winkler(str1, str2, **options)
    return 0 unless str1 && str2

    settings = {ignore_case: true, weight: 0.25}.merge(options)
    JaroWinkler.distance(str1, str2, **settings)
  end
end
|
40
|
+
|
41
|
+
module InstanceMethods
  # Option keys that are handed over to the `::FuzzyMatch` constructor.
  FUZZY_MATCH_OPTIONS = %i[
    identities groupings stop_words read
    must_match_grouping must_match_at_least_one_word
    gather_last_result threshold
  ].freeze

  # Option keys picked from `fuzzy_options` for each scoring helper.
  JARO_OPTIONS     = %i[ignore_case weight].freeze
  NGRAMS_OPTIONS   = %i[range].freeze
  POSITION_OPTIONS = %i[max_distance].freeze
  RESULTS_OPTIONS  = %i[order threshold].freeze

  include StopWords

  # Only the writer is generated: the reader below lazily defaults to `{}`.
  attr_writer :fuzzy_options

  # @return [Hash] the options of the last `fuzzy_match` call (`{}` if none yet).
  def fuzzy_options
    @fuzzy_options ||= {}
  end

  # Builds (or reuses) the underlying `::FuzzyMatch` matcher.
  # The memoized matcher is reused only when no `haystack_data` is given and the
  # matcher-related options are equal to the ones it was built with.
  # @note it sets `::FuzzyMatch.engine` to `:amatch` every time a matcher is built.
  # @param haystack_data [Enumerable, nil] the items to search among.
  # @param options [Hash] stored as `fuzzy_options`; the keys listed in
  #   `FUZZY_MATCH_OPTIONS` are passed to `::FuzzyMatch.new`.
  # @return [::FuzzyMatch] the matcher.
  def fuzzy_match(haystack_data = nil, **options)
    if instance_variable_defined?(:@fuzzy_match) && !haystack_data
      return @fuzzy_match if fuzzy_match_options == fuzzy_match_options(options)
    end

    @fuzzy_options = options
    # make it run with a native C extension (for better performance: ~130 % increase of performance)
    ::FuzzyMatch.engine = :amatch
    @fuzzy_match = ::FuzzyMatch.new(haystack(haystack_data), fuzzy_match_options)
  end

  # TODO: integration for options[:unique_words] => to ensure repeated words do not bring down the score are cut by threshold
  # @note
  #   - Items that are `==` to `needle` are left out of the results.
  #   - When either compared string is blank, all the scores of that item are `0`.
  # @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key.
  # @param needle_str [String, nil] the actual value of needle_str to be used.
  # @param haystack [Enumerable] the items to find `needle` among.
  # @param options [Hash] see `#fuzzy_match`; `:order` and `:threshold` are also applied to the results.
  # @return [Eco::Data::FuzzyMatch::Results]
  def find_all_with_score(needle, needle_str: nil, haystack: nil, **options)
    base_match    = fuzzy_match(haystack, **options)
    match_results = base_match.find_all_with_score(needle_str || needle)
    needle_str  ||= item_string(needle)

    scored = match_results.each_with_object([]) do |(item, dice, lev), out|
      next if item == needle

      item_str = item_string(item)
      if item_str.to_s.strip.empty? || needle_str.to_s.strip.empty?
        dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 0
      end

      # Only the scores not forced above are calculated.
      jaro_res   ||= jaro(needle_str, item_str)
      ngram_res  ||= ngram(needle_str, item_str)
      wngram_res ||= words_ngram(needle_str, item_str)
      pos_res    ||= position(needle_str, item_str)

      out << Result.new(item, item_str, needle_str, dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
    end

    Results.new(needle, needle_str, scored).tap do |res|
      res.order     = fuzzy_options[:order]     if fuzzy_options[:order]
      res.threshold = fuzzy_options[:threshold] if fuzzy_options[:threshold]
    end.relevant_results
  end

  # Re-scores existing `results` with the strings returned by the block.
  # @note when the item string returned by the block is blank, all its scores are `1`.
  # @yield [needle_str, item_str, needle, item] to transform the strings to be compared.
  # @yieldreturn [Array<String>] the pair `[needle_str, item_str]` to score.
  # @param results [Eco::Data::FuzzyMatch::Results] the results to recalculate.
  # @param needle_str [String, nil] used instead of `results.value` when given.
  # @param options [Hash] `:order` and `:threshold` are applied to the new results.
  # @raise [RuntimeError] when no block is given.
  # @return [Eco::Data::FuzzyMatch::Results]
  def recalculate_results(results, needle_str: nil, **options)
    raise "You should provide a block |needle_str, item_str, needle, item|" unless block_given?

    rescored = results.each_with_object([]) do |result, out|
      nstr, istr = yield(needle_str || results.value, result.value, results.needle, result.match)

      if istr.to_s.strip.empty?
        dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 1
      elsif nstr.to_s.strip.empty?
        # NOTE(review): the original condition was `unless istr = needle_str`, an
        # assignment. It is kept as such (made explicit) to preserve behaviour;
        # confirm whether a comparison (`==`) was intended.
        istr = needle_str
        dice = lev = jaro_res = ngram_res = wngram_res = pos_res = 0 unless istr
      end

      similarity = ::FuzzyMatch.score_class.new(nstr, istr) unless dice && lev
      dice       ||= similarity&.dices_coefficient_similar || 0
      lev        ||= similarity&.levenshtein_similar || 0
      jaro_res   ||= jaro(nstr, istr)
      ngram_res  ||= ngram(nstr, istr)
      wngram_res ||= words_ngram(nstr, istr)
      pos_res    ||= position(nstr, istr)

      out << Result.new(*result.values_at(:match, :value, :needle_str), dice, lev, jaro_res, ngram_res, wngram_res, pos_res)
    end

    Results.new(results.needle, results.value, rescored).tap do |res|
      res.order     = options[:order]     if options[:order]
      res.threshold = options[:threshold] if options[:threshold]
    end.relevant_results
  end

  private

  # Jaro-Winkler score, using the `JARO_OPTIONS` found in `fuzzy_options`.
  def jaro(str1, str2)
    options = fuzzy_options.slice(*JARO_OPTIONS)
    self.class.jaro_winkler(str1, str2, **options)
  end

  # Ratio of the ngrams score (`range` defaults to `3..5`).
  def ngram(str1, str2)
    options = {range: 3..5}.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
    self.class.ngrams_score(str1, str2, **options).ratio
  end

  # Ratio of the ngrams score calculated word by word (`range` defaults to `3..7`).
  def words_ngram(str1, str2)
    options = {range: 3..7}.merge(fuzzy_options.slice(*NGRAMS_OPTIONS))
    self.class.words_ngrams_score(str1, str2, **options).ratio
  end

  # Ratio of the chars position score, using the `POSITION_OPTIONS` found in `fuzzy_options`.
  def position(str1, str2)
    options = fuzzy_options.slice(*POSITION_OPTIONS)
    self.class.chars_position_score(str1, str2, **options).ratio
  end

  # @note
  #   - When `data` is not given and `self` is an `Enumerable`, `self` is used.
  #   - It uses `values` when `data` is a `Hash`, and `to_a` otherwise.
  # @param data [Enumerable, nil]
  # @raise [RuntimeError] when there is no `Enumerable` to use, or when an element
  #   is not a `String` and no `read:` option has been provided.
  # @return [Array<Object>] the non-repeated values of `data`
  def haystack(data = nil)
    data = self if !data && is_a?(Enumerable)
    raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)

    # Read the elements from `data` itself, so an explicitly given haystack is honoured.
    items = data.is_a?(Hash) ? data.values.flatten : data.to_a.flatten
    items.uniq.compact.tap do |unique_items|
      next if fuzzy_read_method

      found = unique_items.find { |item| !item.is_a?(String) }
      next unless found

      raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
    end
  end

  # Obtains the string to compare out of `item`.
  # @param item [String, Object, nil] the element to read.
  # @param attr [Proc, Symbol, String, nil] how to read `item` (defaults to the `read:` option).
  # @return [Object, nil] `item` itself when it is falsey, a `String` or there is no `attr`;
  #   otherwise the result of calling `attr` (`nil` when `item` does not respond to it).
  def item_string(item, attr = fuzzy_read_method)
    return item if !item || item.is_a?(String) || !attr
    return attr.call(item) if attr.is_a?(Proc)

    attr = attr.to_sym
    return item.send(attr) if item.respond_to?(attr)
  end

  # @note the built-in stop words are merged last, so they replace any `:stop_words` given.
  # @param options [Hash, nil] defaults to `fuzzy_options`.
  # @return [Hash] the subset of `options` that is passed to `::FuzzyMatch.new`.
  def fuzzy_match_options(options = nil)
    options = fuzzy_options unless options
    options.slice(*FUZZY_MATCH_OPTIONS).merge({
      stop_words: PREPOSITIONS + PRONOUNS + ARTICLES
    })
  end

  # @return [Proc, Symbol, String, nil] the `read:` option currently in use.
  def fuzzy_read_method
    fuzzy_match_options[:read]
  end
end
|
190
|
+
|
191
|
+
# Expose the `ClassMethods` helpers as module-level methods of this module.
singleton_class.include(FuzzyMatch::ClassMethods)
|
194
|
+
|
195
|
+
end
|
196
|
+
end
|
197
|
+
end
|
198
|
+
|
199
|
+
require_relative 'fuzzy_match/score'
|
200
|
+
require_relative 'fuzzy_match/result'
|
201
|
+
require_relative 'fuzzy_match/results'
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Eco
  module Data
    module FuzzyMatch
      module ArrayHelpers
        # Keeps the start order of the `values` and consecutive `values` together/consecutive.
        # Each run of consecutive elements is joined with a single space.
        # @param values [Array] the input array with the values.
        # @param range [Integer, Range] determines the length (number of elements) of the generated runs.
        #   Lengths below `1` generate nothing.
        # @return [Array<String>] the non-repeated space-joined runs of `range` length of `values`.
        def ngrams(values, range = 2..3)
          sizes = range.is_a?(Integer) ? [range] : range
          sizes.flat_map do |size|
            next [] if size < 1

            values.each_cons(size).map { |slice| slice.join(' ') }
          end.uniq
        end

        # Keeps the start order of the `values` of the input `Array` `values`.
        # It does **not** keep consecutive `values` together (it can jump/skip items).
        # @param values [Array] the input array with the values.
        # @param range [Integer, Range] determines the length of the generated values.
        # @return [Array<Array<Value>>] combinations of `range` length of `values`
        def combinations(values, range = 2..3)
          sizes = range.is_a?(Integer) ? [range] : range
          sizes.flat_map { |size| values.combination(size).to_a }
        end

        # It includes `combinations` that break the initial order of the `Array`.
        # It does **not** keep consecutive `values` together (it can jump/skip items).
        # The result lists the `combinations` first, followed by their reorderings.
        # @param values [Array] the input array with the values.
        # @param range [Integer, Range] determines the length of the generated values.
        # @return [Array<Array<Value>>] permutations of `range` length of `values`
        def permutations(values, range = 2..3)
          combos    = combinations(values, range)
          reordered = combos.flat_map { |combo| combo.permutation.to_a }
          (combos + reordered).uniq
        end

        # Helper to prepare a facet structure.
        # @param values1 [Array] the input array with the values to have their facet against.
        # @param values2 [Array] the input array with the values to facet against.
        # @return [Hash] where `keys` are `values1` and `value` of each `key` is a copy of `values2`
        #   (an empty `Hash` when `values1` is not an `Enumerable`).
        def facet(values1, values2)
          return {} unless values1.is_a?(Enumerable)

          keys = values1.is_a?(Hash) ? values1.values : values1.to_a
          keys.each_with_object({}) { |key, out| out[key] = values2.dup }
        end
      end
    end
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module Eco
  module Data
    module FuzzyMatch
      module CharsPositionScore
        # For each character in `str1`, a search is performed on `str2`.
        # A character scores when its first occurrence in `str2` is found before
        # the position `cursor + 1 + max_distance`, where `cursor` is the position
        # of the previous hit (or advances by one on every miss).
        # A score is kept of matching characters; the total is the length of `str1`.
        # @note This algorithm is best suited for matching mis-spellings.
        # @param str1 [String, nil] the string of reference.
        # @param str2 [String, nil] the string to search the characters in.
        # @param max_distance [Integer] maximum char position distance to score.
        # @param normalized [Boolean] to avoid double ups in normalizing.
        # @return [Score] the score object with the result.
        def chars_position_score(str1, str2, max_distance: 3, normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized

          score = Score.new(0, 0)
          return score if !str1 || !str2 || str1.empty? || str2.empty?

          score.total = str1.length
          if str1 == str2
            # Identical strings get the full score straight away.
            score.increase(score.total)
            return score
          end
          return score if str1.length < 2

          cursor = 0
          str1.each_char do |char|
            window_start = cursor + 1
            found_at     = str2.index(char)

            if found_at && found_at < window_start + max_distance
              score.increase
              cursor = found_at
            else
              cursor = window_start
            end
          end
          score
        end
      end
    end
  end
end
|
@@ -0,0 +1,82 @@
|
|
1
|
+
module Eco
  module Data
    module FuzzyMatch
      module NGramsScore
        # It does the following:
        #   1. It splits both strings into words
        #   2. Pairs all words by best `ngrams_score` match
        #   3. Gives `0` score to those words of `str2` that lost their pair (a word of `str1` cannot be paired twice)
        #   4. Merges the `ngrams_score` of all the paired words of `str2` against their `str1` word pair
        # @param str1 [String, nil] the string of reference.
        # @param str2 [String, nil] the string to compare against `str1`.
        # @param range [Integer, Range] determines the length of the generated values for each `word`.
        # @param normalized [Boolean] to avoid double ups in normalizing.
        # @return [Score] the score object with the result.
        def words_ngrams_score(str1, str2, range: 3..5, normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized

          Score.new(0, 0).tap do |score|
            next if !str1 || !str2
            next score.increase_total(str1.length) if str1.empty? || str2.empty?

            if str1 == str2
              score.total = str1.length
              score.increase(score.total)
            end
            # Either string being too short to pair adds the length of `str1` to the total.
            score.increase_total(str1.length) if str1.length < 2 || str2.length < 2

            word_pairs = paired_words(str1, str2, normalized: true) do |needle, item|
              ngrams_score(needle, item, range: range, normalized: true)
            end
            word_pairs.each_value { |(_pair, pair_score)| score.merge!(pair_score) }
          end
        end

        # A score is kept of matching ngram combinations of `str2`.
        # The total of the score is the length of `str1`.
        # @note This algorithm is best suited for matching sentences, or 'firstname lastname' compared with 'lastname firstname' combinations.
        # @param str1 [String, nil] the string of reference.
        # @param str2 [String, nil] the string the ngrams are generated from.
        # @param range [Integer, Range] determines the length of the generated values.
        # @param normalized [Boolean] to avoid double ups in normalizing.
        # @return [Score] the score object with the result.
        def ngrams_score(str1, str2, range: 3..5, normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized

          Score.new(0, str1 ? str1.length : 0).tap do |score|
            next if !str1 || !str2
            next if str1.empty? || str2.empty?

            score.total = str1.length
            next score.increase(score.total) if str1 == str2
            next if str1.length < 2 || str2.length < 2

            grams       = word_ngrams(str2, range, normalized: true)
            grams_count = grams.length
            next unless grams_count > 0

            if range.is_a?(Integer)
              item_weight = score.total.to_f / grams_count
              matches     = grams.count { |gram| str1.include?(gram) }
              score.increase(matches * item_weight)
            else
              # Every ngram length gets the same share of the total score.
              groups        = grams.group_by(&:length)
              group_weight  = (1.0 / groups.length).round(3)
              len_max_score = score.total * group_weight
              item_weight   = len_max_score / grams_count

              groups.each_value do |group|
                matches = group.count { |gram| str1.include?(gram) }
                score.increase(matches * item_weight)
              end
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module Eco
  module Data
    module FuzzyMatch
      module Pairing
        # Pair words using some algorithm.
        # It does the following:
        #   1. It splits both strings into words.
        #   2. Pairs all words by using `block` to score the best match
        #      (only scores with a `ratio` above `0.05` are considered).
        #   3. Gives `0` score to those words of `str1` that could not be paired
        #      (a word of `str2` is not paired twice).
        #   4. Returns each word of `str1` with its pair and the `Score` of the pairing.
        # @yield [needle, item] offers a comparison algorithm between two strings.
        # @yieldparam needle [String] the string of reference.
        # @yieldparam item [String] one of the haystack items.
        # @yieldreturn [Eco::Data::FuzzyMatch::Score] the `Score` object with the results of comparing `str1` and `str2`
        # @param str1 [String] the string of reference.
        # @param str2 [String] one of the haystack items.
        # @param normalized [Boolean] to avoid double ups in normalizing.
        # @raise [RuntimeError] when a selected pairing does not carry a `Score` object.
        # @return [Hash] where `keys` are the **words** of `str1` and their `values` a pair array of `pair` and `Score`
        def paired_words(str1, str2, normalized: false)
          str1, str2 = normalize_string([str1, str2]) unless normalized
          return {nil => [nil, Score.new(0, 0)]} if !str2 || !str1
          return {str1 => [nil, Score.new(0, 0)]} if str1.length < 2 || str2.length < 2

          needles  = get_words(str1, normalized: true)
          haystack = get_words(str2, normalized: true)

          # `ranking` : item   => candidate needles (ratio above 0.05)
          # `faceted` : needle => all the items with their score, best first
          ranking = {}
          faceted = needles.each_with_object({}) do |needle, facets|
            scored = haystack.map do |item|
              result = {pair: item, score: yield(needle, item)}
              ranking[item] ||= []
              ranking[item] << {needle: needle, score: result[:score]} if result[:score].ratio > 0.05
              result
            end
            facets[needle] = scored.sort_by { |result| result[:score].ratio }.reverse
          end

          # First pass: each item takes its best scoring needle that is still free.
          paired = {}
          ranking.each do |item, candidates|
            available = candidates.reject { |candidate| paired.key?(candidate[:needle]) }
            best      = available.sort_by { |candidate| candidate[:score].ratio }.last
            next unless best

            unless best[:score].is_a?(Eco::Data::FuzzyMatch::Score)
              raise "Parining ('#{str1}' vs '#{str2}') -> Something got sour with needle '#{best[:needle]}' and item #{item}"
            end

            paired[best[:needle]] = {pair: item, score: best[:score]}
          end

          # Second pass: free needles can only take items that have not been paired yet.
          # `paired` values are Hashes, so the paired items must be read out of them.
          pending_items = haystack - paired.values.map { |data| data[:pair] }
          faceted.each do |needle, results|
            next if paired.key?(needle)

            result = results.find do |candidate|
              pending_items.include?(candidate[:pair]) && candidate[:score].ratio > 0.05
            end
            next unless result

            unless result[:score].is_a?(Eco::Data::FuzzyMatch::Score)
              raise "Parining ('#{str1}' vs '#{str2}') -> Something got sour with needle '#{needle}' and item #{result[:pair]}"
            end

            paired[needle] = result
            pending_items.delete(result[:pair])
          end

          # Needles left without pair score 0 over their own length.
          (needles - paired.keys).each do |needle|
            paired[needle] = {pair: nil, score: Score.new(0, needle.length)}
          end

          paired.transform_values { |data| data.values_at(:pair, :score) }
        end
      end
    end
  end
end
|