eco-helpers 2.0.13 → 2.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +87 -2
- data/eco-helpers.gemspec +6 -4
- data/lib/eco-helpers.rb +2 -0
- data/lib/eco/api/common/base_loader.rb +14 -0
- data/lib/eco/api/common/people/default_parsers/date_parser.rb +11 -1
- data/lib/eco/api/common/people/default_parsers/login_providers_parser.rb +1 -1
- data/lib/eco/api/common/people/default_parsers/policy_groups_parser.rb +11 -11
- data/lib/eco/api/common/people/person_entry.rb +9 -2
- data/lib/eco/api/common/people/supervisor_helpers.rb +27 -0
- data/lib/eco/api/common/session/file_manager.rb +2 -2
- data/lib/eco/api/common/session/mailer.rb +0 -1
- data/lib/eco/api/common/session/s3_uploader.rb +0 -1
- data/lib/eco/api/common/session/sftp.rb +0 -1
- data/lib/eco/api/common/version_patches/exception.rb +8 -4
- data/lib/eco/api/error.rb +5 -3
- data/lib/eco/api/microcases.rb +3 -1
- data/lib/eco/api/microcases/append_usergroups.rb +0 -1
- data/lib/eco/api/microcases/people_cache.rb +2 -2
- data/lib/eco/api/microcases/people_load.rb +2 -2
- data/lib/eco/api/microcases/people_refresh.rb +2 -2
- data/lib/eco/api/microcases/people_search.rb +6 -6
- data/lib/eco/api/microcases/preserve_default_tag.rb +23 -0
- data/lib/eco/api/microcases/preserve_filter_tags.rb +28 -0
- data/lib/eco/api/microcases/preserve_policy_groups.rb +30 -0
- data/lib/eco/api/microcases/set_account.rb +0 -1
- data/lib/eco/api/organization.rb +1 -0
- data/lib/eco/api/organization/people.rb +7 -0
- data/lib/eco/api/organization/people_analytics.rb +60 -0
- data/lib/eco/api/organization/presets_factory.rb +116 -93
- data/lib/eco/api/organization/presets_integrity.json +58 -0
- data/lib/eco/api/organization/presets_values.json +5 -4
- data/lib/eco/api/policies/default_policies/99_user_access_policy.rb +0 -30
- data/lib/eco/api/session.rb +1 -20
- data/lib/eco/api/session/batch.rb +23 -7
- data/lib/eco/api/session/batch/job.rb +3 -0
- data/lib/eco/api/session/config.rb +16 -15
- data/lib/eco/api/session/config/api.rb +4 -0
- data/lib/eco/api/session/config/apis.rb +80 -0
- data/lib/eco/api/session/config/files.rb +7 -0
- data/lib/eco/api/session/config/people.rb +3 -19
- data/lib/eco/api/usecases/default_cases.rb +4 -1
- data/lib/eco/api/usecases/default_cases/abstract_policygroup_abilities_case.rb +161 -0
- data/lib/eco/api/usecases/default_cases/analyse_people_case.rb +76 -0
- data/lib/eco/api/usecases/default_cases/codes_to_tags_case.rb +2 -3
- data/lib/eco/api/usecases/default_cases/reset_landing_page_case.rb +11 -1
- data/lib/eco/api/usecases/default_cases/restore_db_case.rb +1 -2
- data/lib/eco/api/usecases/default_cases/supers_cyclic_identify_case.rb +72 -0
- data/lib/eco/api/usecases/default_cases/supers_hierarchy_case.rb +59 -0
- data/lib/eco/api/usecases/default_cases/to_csv_case.rb +104 -26
- data/lib/eco/api/usecases/default_cases/to_csv_detailed_case.rb +62 -36
- data/lib/eco/cli.rb +0 -10
- data/lib/eco/cli/config/default/options.rb +19 -17
- data/lib/eco/cli/config/default/people_filters.rb +3 -3
- data/lib/eco/cli/config/default/usecases.rb +77 -25
- data/lib/eco/cli/config/default/workflow.rb +12 -3
- data/lib/eco/cli/config/help.rb +1 -0
- data/lib/eco/cli/config/options_set.rb +106 -13
- data/lib/eco/cli/config/use_cases.rb +33 -33
- data/lib/eco/cli/scripting/args_helpers.rb +30 -3
- data/lib/eco/data.rb +1 -0
- data/lib/eco/data/crypto/encryption.rb +3 -3
- data/lib/eco/data/files/directory.rb +28 -20
- data/lib/eco/data/files/helpers.rb +6 -4
- data/lib/eco/data/fuzzy_match.rb +119 -0
- data/lib/eco/data/fuzzy_match/array_helpers.rb +75 -0
- data/lib/eco/data/fuzzy_match/chars_position_score.rb +37 -0
- data/lib/eco/data/fuzzy_match/ngrams_score.rb +73 -0
- data/lib/eco/data/fuzzy_match/pairing.rb +102 -0
- data/lib/eco/data/fuzzy_match/result.rb +67 -0
- data/lib/eco/data/fuzzy_match/results.rb +53 -0
- data/lib/eco/data/fuzzy_match/score.rb +44 -0
- data/lib/eco/data/fuzzy_match/stop_words.rb +35 -0
- data/lib/eco/data/fuzzy_match/string_helpers.rb +69 -0
- data/lib/eco/version.rb +1 -1
- metadata +86 -10
- data/lib/eco/api/microcases/refresh_abilities.rb +0 -19
- data/lib/eco/api/organization/presets_reference.json +0 -59
- data/lib/eco/api/usecases/default_cases/refresh_abilities_case.rb +0 -30
@@ -3,11 +3,13 @@ module Eco
|
|
3
3
|
module Files
|
4
4
|
DEFAULT_TIMESTAMP_PATTERN = '%Y-%m-%dT%H%M%S'
|
5
5
|
|
6
|
-
|
7
|
-
base
|
8
|
-
|
6
|
+
class << self
|
7
|
+
def included(base)
|
8
|
+
base.send(:include, InstanceMethods)
|
9
|
+
base.extend(ClassMethods)
|
10
|
+
end
|
9
11
|
end
|
10
|
-
|
12
|
+
|
11
13
|
module InstanceMethods
|
12
14
|
|
13
15
|
end
|
@@ -0,0 +1,119 @@
|
|
1
|
+
require 'fuzzy_match'
|
2
|
+
require 'amatch'
|
3
|
+
require 'jaro_winkler'
|
4
|
+
|
5
|
+
require_relative 'fuzzy_match/stop_words'
|
6
|
+
require_relative 'fuzzy_match/array_helpers'
|
7
|
+
require_relative 'fuzzy_match/string_helpers'
|
8
|
+
require_relative 'fuzzy_match/pairing'
|
9
|
+
require_relative 'fuzzy_match/chars_position_score'
|
10
|
+
require_relative 'fuzzy_match/ngrams_score'
|
11
|
+
|
12
|
+
module Eco
|
13
|
+
module Data
|
14
|
+
module FuzzyMatch
|
15
|
+
|
16
|
+
class << self
|
17
|
+
# Hook invoked when this module is included into `base`.
# Mixes `InstanceMethods` into `base` and extends `base` with `ClassMethods`.
# @param base [Module] the class or module including this module.
def included(base)
  base.include(InstanceMethods)
  base.extend(ClassMethods)
end
|
21
|
+
end
|
22
|
+
|
23
|
+
module ClassMethods
|
24
|
+
include ArrayHelpers
|
25
|
+
include StringHelpers
|
26
|
+
include Pairing
|
27
|
+
include CharsPositionScore
|
28
|
+
include NGramsScore
|
29
|
+
|
30
|
+
# Compares two strings through `JaroWinkler.distance`, asking it to ignore
# case and to use a `weight` of `0.25`.
# @param str1 [String] first string to compare.
# @param str2 [String] second string to compare.
# @return [Object] whatever `JaroWinkler.distance` returns (presumably a Float — confirm against the library).
def jaro_winkler(str1, str2)
  JaroWinkler.distance(str1, str2, ignore_case: true, weight: 0.25)
end
|
37
|
+
|
38
|
+
end
|
39
|
+
|
40
|
+
module InstanceMethods
|
41
|
+
include StopWords
|
42
|
+
|
43
|
+
attr_accessor :fuzzy_options
|
44
|
+
|
45
|
+
# @return [Hash] the options currently stored in `@fuzzy_options`;
#   an empty `Hash` is created and stored on first access.
def fuzzy_options
  @fuzzy_options = {} unless @fuzzy_options
  @fuzzy_options
end
|
48
|
+
|
49
|
+
# Builds (once) and returns the underlying `::FuzzyMatch` matcher.
# @note
#   - The matcher is memoized in `@fuzzy_match`: once built, later calls return it
#     as is and their `haystack` / `options` are not used.
#   - `options[:stop_words]` is always set to `PREPOSITIONS + PRONOUNS + ARTICLES`.
#   - It sets `::FuzzyMatch.engine` to `:amatch`.
# @param haystack [Enumerable, nil] the items to search in (resolved via `obtain_haystack`).
# @param options [Hash] stored (merged with the stop words) as `fuzzy_options`.
# @raise [RuntimeError] when the haystack holds a non `String` element and no `read:` option was given.
# @return [::FuzzyMatch] the matcher.
def fuzzy_match(haystack = nil, **options)
  return @fuzzy_match if instance_variable_defined?(:@fuzzy_match)

  stop_words = PREPOSITIONS + PRONOUNS + ARTICLES
  @fuzzy_options = options.merge(stop_words: stop_words)

  # make it run with a native C extension (for better performance: ~130 % increase of performance)
  ::FuzzyMatch.engine = :amatch

  candidates = obtain_haystack(haystack)
  unless fuzzy_read_method
    # Without a `read` method every element must already be a String.
    found = candidates.find { |candidate| !candidate.is_a?(String) }
    if found
      raise "To use non String objects as 'haystack' you should provide `read:` or `options[:read]`. Given element: #{found.class}"
    end
  end

  @fuzzy_match = ::FuzzyMatch.new(candidates, fuzzy_options)
end
|
63
|
+
|
64
|
+
# Searches `needle` in the haystack and scores every match with several algorithms.
# @note
#   - Matches that are `==` to the `needle` itself are left out of the results
#     (relevant when the `haystack` elements are **non** `String` objects).
# @param needle [String, Object] object is allowed when `fuzzy_options` includes `read:` key
# @param options [Hash] forwarded to `fuzzy_match`.
# @return [Eco::Data::FuzzyMatch::Results]
def find_all_with_score(needle, **options)
  raw_matches = fuzzy_match(**options).find_all_with_score(needle)

  scored = raw_matches.reject do |candidate, _dice, _lev|
    candidate == needle
  end.map do |candidate, dice, lev|
    needle_text    = item_string(needle)
    candidate_text = item_string(candidate)
    scorer         = self.class
    Result.new(
      candidate,
      candidate_text,
      dice,
      lev,
      scorer.jaro_winkler(needle_text, candidate_text),
      scorer.ngrams_score(needle_text, candidate_text, range: 3..5).ratio,
      scorer.words_ngrams_score(needle_text, candidate_text, range: 3..7).ratio,
      scorer.chars_position_score(needle_text, candidate_text).ratio
    )
  end

  Results.new(needle, item_string(needle), scored)
end
|
83
|
+
|
84
|
+
private
|
85
|
+
|
86
|
+
# Resolves the collection to search in and flattens it into a plain list.
# @note
#   - When no `data` is given and `self` is an `Enumerable`, `self` is used.
#   - A `Hash` contributes its `values`; any other `Enumerable` is converted with `to_a`.
# @param data [Enumerable, nil]
# @raise [RuntimeError] when the resolved `data` is not an `Enumerable`.
# @return [Array<Object>] the flattened, non-repeated and non-`nil` values of `data`
def obtain_haystack(data = nil)
  data = self if self.is_a?(Enumerable) && !data
  raise "'data' should be an Enumerable. Given: #{data.class}" unless data.is_a?(Enumerable)

  # Read from the resolved `data` (not from `self`), so an explicitly given
  # haystack is honoured even when the receiver is not an Enumerable.
  values = data.is_a?(Hash) ? data.values : data.to_a
  values.flatten.uniq.compact
end
|
96
|
+
|
97
|
+
# Obtains the text used to compare `item`.
# @param item [String, Object, nil] the element to read the text from.
# @param attr [String, Symbol, nil] name of the method to call on `item`
#   (defaults to `fuzzy_read_method`).
# @return [Object, nil] `item` itself when it is falsy, a `String`, or no `attr` is available;
#   otherwise the result of calling `attr` on `item`, or `nil` if `item` does not respond to it.
def item_string(item, attr = fuzzy_read_method)
  return item unless item && !item.is_a?(String) && attr

  reader = attr.to_sym
  item.respond_to?(reader) ? item.send(reader) : nil
end
|
102
|
+
|
103
|
+
# @return [Object, nil] the value stored under the `:read` key of `fuzzy_options`.
def fuzzy_read_method
  settings = fuzzy_options
  settings[:read]
end
|
106
|
+
|
107
|
+
end
|
108
|
+
|
109
|
+
class << self
|
110
|
+
include FuzzyMatch::ClassMethods
|
111
|
+
end
|
112
|
+
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
require_relative 'fuzzy_match/score'
|
118
|
+
require_relative 'fuzzy_match/result'
|
119
|
+
require_relative 'fuzzy_match/results'
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Eco
|
2
|
+
module Data
|
3
|
+
module FuzzyMatch
|
4
|
+
module ArrayHelpers
|
5
|
+
# Builds the n-grams of `values`: runs of consecutive elements, joined with a space.
# Keeps the start order of the `values` and consecutive `values` together/consecutive.
# @param values [Array] the input array with the values.
# @param range [Integer, Range] determines the length (number of elements) of the generated n-grams.
#   Sizes lower than `1` generate nothing.
# @return [Array<String>] the non-repeated n-grams of `range` length of `values`.
def ngrams(values, range = 2..3)
  unless range.is_a?(Integer)
    # One pass per size; duplicates across sizes are dropped.
    return range.flat_map { |size| ngrams(values, size) }.uniq
  end

  # A window of size 0 is meaningless (and `each_cons` rejects it).
  return [] if range < 1

  values.each_cons(range).map { |chunk| chunk.join(' ') }.uniq
end
|
27
|
+
|
28
|
+
# Generates the combinations of the elements of `values`.
# Keeps the start order of the `values` of the input `Array` `values`.
# It does **not** keep consecutive `values` together (it can jump/skip items).
# @param values [Array] the input array with the values.
# @param range [Integer, Range] determines the length of the generated combinations.
# @return [Array<Array<Value>>] combinations of `range` length of `values`
def combinations(values, range = 2..3)
  sizes = range.is_a?(Integer) ? [range] : range
  sizes.flat_map { |size| values.combination(size).to_a }
end
|
40
|
+
|
41
|
+
# Generates the combinations of `values` plus every reordering of each of them.
# It includes `combinations` that break the initial order of the `Array`.
# It does **not** keep consecutive `values` together (it can jump/skip items).
# @param values [Array] the input array with the values.
# @param range [Integer, Range] determines the length of the generated permutations.
# @return [Array<Array<Value>>] permutations of `range` length of `values`
#   (the plain combinations come first, followed by their reorderings).
def permutations(values, range = 2..3)
  lengths  = range.is_a?(Integer) ? (range..range) : range
  ordered  = combinations(values, range)
  shuffled = ordered.select do |combo|
    lengths.include?(combo.length)
  end.flat_map do |combo|
    combo.permutation.to_a
  end
  (ordered + shuffled).uniq
end
|
59
|
+
|
60
|
+
# Helper to prepare a facet structure.
# @param values1 [Array] the input array with the values to have their facet against
#   (when it is a `Hash`, its `values` are used).
# @param values2 [Array] the input array with the values to facet against.
# @return [Hash] where `keys` are `values1` and the `value` of each `key` is a copy (`dup`) of `values2`;
#   an empty `Hash` when `values1` is not an `Enumerable`.
def facet(values1, values2)
  return {} unless values1.is_a?(Enumerable)

  keys = values1.is_a?(Hash) ? values1.values : values1.to_a
  keys.each_with_object({}) { |key, out| out[key] = values2.dup }
end
|
71
|
+
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Eco
|
2
|
+
module Data
|
3
|
+
module FuzzyMatch
|
4
|
+
module CharsPositionScore
|
5
|
+
# For each character in `str1`, a search is performed on `str2`.
# The search is deemed successful if the first occurrence of the character in `str2`
# is located before `max_distance` characters past the current position.
# A score is kept of matching characters.
# @note This algorithm is best suited for matching mis-spellings.
# @param str1 [String, nil] the string of reference.
# @param str2 [String, nil] the string to compare against.
# @param max_distance [Integer] maximum char position distance to score.
# @param normalized [Boolean] to avoid double ups in normalizing.
# @return [Score] the score object with the result (its total is the length of `str1`, or `0` when missing).
def chars_position_score(str1, str2, max_distance: 3, normalized: false)
  str1, str2 = normalize_string([str1, str2]) unless normalized

  score = Score.new(0, str1 ? str1.length : 0)
  return score unless str1 && str2

  if str1 == str2
    # Identical strings get the full score.
    score.increase(score.total)
    return score
  end
  return score if str1.length < 2

  cursor = 0
  str1.each_char do |char|
    window_start = cursor + 1
    hit = str2.index(char)
    if hit && hit < window_start + max_distance
      score.increase
      cursor = hit
    else
      # No (close enough) occurrence: just move one position forward.
      cursor = window_start
    end
  end
  score
end
|
33
|
+
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module Eco
|
2
|
+
module Data
|
3
|
+
module FuzzyMatch
|
4
|
+
module NGramsScore
|
5
|
+
# It does the following:
#   1. It splits both strings into words
#   2. Pairs all words by best `ngrams_score` match
#   3. Gives `0` score to those words of `str2` that lost their pair (a word of `str1` cannot be paired twice)
#   4. Merges the `ngrams_score` of all the paired words of `str2` against their `str1` word pair
# @param str1 [String, nil] the string of reference.
# @param str2 [String, nil] the string to compare against.
# @param range [Integer, Range] determines the length of the generated values for each `word`.
# @param normalized [Boolean] to avoid double ups in normalizing.
# @return [Score] the score object with the result.
def words_ngrams_score(str1, str2, range: 3..5, normalized: false)
  str1, str2 = normalize_string([str1, str2]) unless normalized

  Score.new(0, 0).tap do |score|
    next if !str2 || !str1
    # NOTE(review): the total starts at 0, so this adds 0 for identical strings —
    # confirm that is the intended score for an exact match.
    next score.increase(score.total) if str1 == str2
    # Both strings need at least 2 chars to be worth pairing.
    next if str1.length < 2 || str2.length < 2

    pairs = paired_words(str1, str2, normalized: true) do |needle, item|
      ngrams_score(needle, item, range: range, normalized: true)
    end
    pairs.each do |_needle, (_pair, word_score)|
      score.merge!(word_score)
    end
  end
end
|
30
|
+
|
31
|
+
# A score is kept of matching ngram combinations of `str2`.
# @note This algorithm is best suited for matching sentences, or 'firstname lastname' compared with 'lastname firstname' combinations.
# @param str1 [String, nil] the string of reference.
# @param str2 [String, nil] the string whose ngrams are searched in `str1`.
# @param range [Integer, Range] determines the length of the generated values.
# @param normalized [Boolean] to avoid double ups in normalizing.
# @return [Score] the score object with the result (its total is the length of `str1`, or `0` when missing).
def ngrams_score(str1, str2, range: 3..5, normalized: false)
  str1, str2 = normalize_string([str1, str2]) unless normalized

  Score.new(0, str1 ? str1.length : 0).tap do |score|
    next if !str2 || !str1
    next score.increase(score.total) if str1 == str2
    next if str1.length < 2 || str2.length < 2

    grams = word_ngrams(str2, range, normalized: true)
    next if grams.empty?

    if range.is_a?(Integer)
      # Single length: every ngram weighs the same share of the total.
      item_weight = score.total.to_f / grams.length
      matches     = grams.count { |gram| str1.include?(gram) }
      score.increase(matches * item_weight)
    else
      # Several lengths: each length gets the same share of the total
      # (rounded to 3 decimals), split evenly among the ngrams of that length.
      groups       = grams.group_by(&:length)
      group_weight = (1.0 / groups.length).round(3)

      groups.each_value do |sized_grams|
        len_max_score = score.total * group_weight
        item_weight   = len_max_score / sized_grams.length
        matches       = sized_grams.count { |gram| str1.include?(gram) }
        score.increase(matches * item_weight)
      end
    end
  end
end
|
69
|
+
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
module Eco
|
2
|
+
module Data
|
3
|
+
module FuzzyMatch
|
4
|
+
module Pairing
|
5
|
+
|
6
|
+
# Pair words using some algorithm.
# It does the following:
#   1. It splits both strings into words.
#   2. Pairs all words by using `block` to score the best match.
#   3. Words of `str1` left without a pair get a `nil` pair and a `0` score
#      (a word of `str2` is not paired twice).
#   4. Returns, per word of `str1`, its pair and/or `Score` (see `format`).
# @note Only matches with a `Score#ratio` above `0.05` are considered for pairing.
# @yield [needle, item] offers a comparison algorithm between two strings.
# @yieldparam needle [String] the string of reference.
# @yieldparam item [String] one of the haystack items.
# @yieldreturn [Eco::Data::FuzzyMatch::Score] the `Score` object with the results of comparing `str1` and `str2`
# @param str1 [String] the string of reference.
# @param str2 [String] one of the haystack items.
# @param format [Symbol, Array<Symbol>] determines the `values` of the returned `Hash`:
#   1. `:pair` for just pair
#   2. `:score` for just score
#   3. `[:pair, :score]` for `Array`
# @param normalized [Boolean] to avoid double ups in normalizing.
# @return [Hash] where `keys` are the **words** of `str1` and their `values`:
#   1. if `format` is `:pair` => the `str2` words with highest match.
#   2. if `format` is `:score` => the `Score` words with highest match.
#   3. if `format` is `[:pair, :score]` => both in an `Array`.
#   It is empty when any of the strings is missing, and `{str1 => nil}` when
#   any of the strings is shorter than 2 characters.
def paired_words(str1, str2, format: [:pair, :score], normalized: false)
  str1, str2 = normalize_string([str1, str2]) unless normalized
  return {} if !str2 || !str1
  # Identical strings need no special case: each word simply pairs through the block.
  return {str1 => nil} if str1.length < 2 || str2.length < 2

  needles  = get_words(str1, normalized: true)
  haystack = get_words(str2, normalized: true)

  # ranking: item (word of str2) => candidate needles scoring above the threshold
  # faceted: needle (word of str1) => all items with their score, best first
  ranking = {}
  faceted = needles.each_with_object({}) do |needle, facets|
    scored = haystack.map do |item|
      {pair: item, score: yield(needle, item)}.tap do |result|
        ranking[item] ||= []
        if result[:score].ratio > 0.05
          ranking[item] << ({needle: needle, score: result[:score]})
        end
      end
    end
    facets[needle] = scored.sort_by { |result| result[:score].ratio }.reverse
  end

  # 1st pass: each item takes its best scoring needle that is still free.
  paired = {}
  ranking.each do |item, candidates|
    best = candidates.reject do |candidate|
      paired.key?(candidate[:needle])
    end.sort_by do |candidate|
      candidate[:score].ratio
    end.last
    next unless best
    paired[best[:needle]] = {pair: item, score: best[:score]}
  end

  # 2nd pass: needles still unpaired may take their best item among those NOT yet paired.
  # `paired` holds `{pair:, score:}` hashes, so the paired items must be read from `:pair`.
  pending_items = haystack - paired.values.map { |result| result[:pair] }
  faceted.each do |needle, results|
    next if paired.key?(needle)
    result = results.find do |candidate|
      pending_items.include?(candidate[:pair]) && candidate[:score].ratio > 0.05
    end
    next unless result
    paired[needle] = result
    pending_items.delete(result[:pair])
  end

  # Needles that found no pair at all score 0 over their own length.
  (needles - paired.keys).each do |needle|
    paired[needle] = {pair: nil, score: Score.new(0, needle.length)}
  end

  paired.transform_values do |result|
    format.is_a?(Array) ? result.values_at(*format) : result[format]
  end
end
|
98
|
+
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module Eco
|
2
|
+
module Data
|
3
|
+
module FuzzyMatch
|
4
|
+
# One match of a fuzzy search together with the score given by each algorithm.
#   - `match` is the matched item and `value` the string it was compared by.
#   - The rest of members hold the score of the algorithm they are named after.
class Result < Struct.new(:match, :value, :dice, :levenshtein, :jaro_winkler, :ngrams, :words_ngrams, :chars_position)
  ALL_METHODS   = [:dice, :levenshtein, :jaro_winkler, :ngrams, :words_ngrams, :chars_position].freeze
  # Scores used (in this priority) to sort results when no `order` has been set.
  DEFAULT_ORDER = [:words_ngrams, :dice].freeze

  # Score readers: same as the `Struct` members, but rounded to 3 decimals (`nil` stays `nil`).
  def dice; super&.round(3); end
  def levenshtein; super&.round(3); end
  def jaro_winkler; super&.round(3); end
  def ngrams; super&.round(3); end
  def words_ngrams; super&.round(3); end
  def chars_position; super&.round(3); end

  # TODO: print in the order of `order`
  # @return [String] a one line summary with all the scores and the `value`.
  def print
    msg  = "(Dice: #{dice}) (Lev Dst: #{levenshtein}) "
    msg << "(Jaro: #{jaro_winkler}) "
    msg << "(Ngram: #{ngrams}) (WNgrams: #{words_ngrams}) "
    msg << "(C Pos: #{chars_position}) "
    msg << "'#{value}'"
  end

  # @param methods [Symbol, Array<Symbol>] the scores to check (defaults to `order`).
  # @param threshold [Numeric, nil] minimum score; when falsy, the check always passes.
  # @return [Boolean] `true` if **all** the `methods` reach the `threshold`.
  def all_threshold?(methods = order, threshold = 0.15)
    return true unless threshold
    [methods].flatten.compact.all? {|method| threshold?(method, threshold)}
  end

  # @param methods [Symbol, Array<Symbol>] the scores to check (defaults to `order`).
  # @param threshold [Numeric, nil] minimum score; when falsy, the check always passes.
  # @return [Boolean] `true` if **any** of the `methods` reaches the `threshold`.
  def any_threshold?(methods = order, threshold = 0.15)
    return true unless threshold
    [methods].flatten.compact.any? {|method| threshold?(method, threshold)}
  end

  # @param method [Symbol] the score to check.
  # @param threshold [Numeric] minimum score.
  # @raise [RuntimeError] when this object does not respond to `method`.
  # @return [Boolean] `true` if the score reaches the `threshold`; a missing (`nil`) score never does.
  def threshold?(method = :dice, threshold = 0.15)
    raise "Uknown method '#{method}'" unless self.respond_to?(method)
    value = self.send(method)
    !value.nil? && value >= threshold
  end

  # Sets the scores (by priority) used to compare results.
  # @param values [Symbol, Array<Symbol>, nil] when empty or `nil`, the default order is restored.
  def order=(values)
    requested = [values].flatten.compact
    @order    = requested.empty? ? DEFAULT_ORDER.dup : requested
  end

  # @return [Array<Symbol>] the scores (by priority) used to compare results.
  def order
    @order ||= DEFAULT_ORDER.dup
  end

  # Results with a higher score sort first.
  # @param result [Result] the result to compare with.
  # @return [Integer] `-1`, `0` or `1`.
  def <=>(result)
    compare(result)
  end

  private

  # Compares score by score, following `order`, until one of them differs.
  # @raise [RuntimeError] when any of the objects does not respond to one of the `order` methods.
  # @return [Integer] `-1` when `self` has the higher score, `1` when `other` does, `0` if all are equal.
  def compare(other, order: self.order)
    order.each do |method|
      raise "Uknown method '#{method}'" unless self.respond_to?(method) && other.respond_to?(method)
      mine   = self.send(method)
      theirs = other.send(method)
      return -1 if mine > theirs
      return  1 if mine < theirs
    end
    0
  end
end
|
64
|
+
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|